use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in project data-prep by Talend.
the class StandardizeInvalidTest method should_accept_column.
@Test
public void should_accept_column() {
    // given: a STRING column whose domain is COUNTRY and that lists COUNTRY among its semantic domains
    final List<SemanticDomain> domains = new ArrayList<>();
    domains.add(new SemanticDomain("COUNTRY", "Country", 0.85f));
    final ColumnMetadata countryColumn = ColumnMetadata.Builder.column() //
            .id(0) //
            .name("name") //
            .type(Type.STRING) //
            .semanticDomains(domains) //
            .domain(SemanticCategoryEnum.COUNTRY.name()) //
            .build();

    // then: the action under test accepts that column
    assertTrue(action.acceptField(countryColumn));
}
use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in project data-prep by Talend.
the class DataSetServiceTest method updateDatasetColumn_should_update_domain.
@Test
public void updateDatasetColumn_should_update_domain() throws Exception {
    // given: a dataset created by posting the TAGADA_CSV resource
    final String dataSetId = given() //
            .body(IOUtils.toString(this.getClass().getResourceAsStream(TAGADA_CSV), UTF_8)) //
            .queryParam(CONTENT_TYPE, "text/csv") //
            .when() //
            .post("/datasets") //
            .asString();

    // Register the extra "JSO" domain on column 0002 directly in the repository. The dataset lock is
    // needed here, otherwise the semantic domain will be erased by analysis.
    final DistributedLock datasetLock = dataSetMetadataRepository.createDatasetMetadataLock(dataSetId);
    final ColumnMetadata column;
    datasetLock.lock();
    try {
        final DataSetMetadata storedMetadata = dataSetMetadataRepository.get(dataSetId);
        assertNotNull(storedMetadata);
        final RowMetadata storedRow = storedMetadata.getRowMetadata();
        assertNotNull(storedRow);
        column = storedRow.getById("0002");
        column.getSemanticDomains().add(new SemanticDomain("JSO", "JSO label", 1.0F));
        dataSetMetadataRepository.save(storedMetadata);
    } finally {
        datasetLock.unlock();
    }

    // sanity check: before the update request the column still reports its original domain
    assertThat(column.getDomain(), is("FIRST_NAME"));
    assertThat(column.getDomainLabel(), is("First Name"));
    assertThat(column.getDomainFrequency(), is(100.0F));

    // when: the column domain is switched to "JSO" through the REST endpoint
    final Response response = given() //
            .body("{\"domain\": \"JSO\"}") //
            .when() //
            .contentType(JSON) //
            .post("/datasets/{dataSetId}/column/{columnId}", dataSetId, "0002");

    // then: the call succeeds and the metadata reloaded from the repository exposes the new domain
    response.then().statusCode(200);
    final DataSetMetadata reloadedMetadata = dataSetMetadataRepository.get(dataSetId);
    assertNotNull(reloadedMetadata);
    final RowMetadata reloadedRow = reloadedMetadata.getRowMetadata();
    assertNotNull(reloadedRow);
    final ColumnMetadata actual = reloadedRow.getById("0002");
    assertThat(actual.getDomain(), is("JSO"));
    assertThat(actual.getDomainLabel(), is("JSO label"));
    assertThat(actual.getDomainFrequency(), is(1.0F));
}
use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in project data-prep by Talend.
the class SchemaAnalysisTest method testTDP_279.
/**
 * Checks the schema analysis of <code>post_code.xls</code>: a single <code>zip</code> column of integer type is
 * expected, with <code>FR_POSTAL_CODE</code> as its domain and four candidate semantic domains.
 * <p>
 * See <a href="https://jira.talendforge.org/browse/TDP-279">https://jira.talendforge.org/browse/TDP-279</a>.
 */
@Test
public void testTDP_279() {
    final DataSetMetadata actual = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../post_code.xls"));
    assertThat(actual.getLifecycle().schemaAnalyzed(), is(true));
    String[] expectedNames = { "zip" };
    Type[] expectedTypes = { Type.INTEGER };
    String[] expectedDomains = { "FR_POSTAL_CODE" };
    int i = 0;
    for (ColumnMetadata column : actual.getRowMetadata().getColumns()) {
        assertThat(column.getName(), is(expectedNames[i]));
        assertThat(column.getType(), is(expectedTypes[i].getName()));
        assertThat(column.getDomain(), is(expectedDomains[i]));
        assertThat(column.getSemanticDomains()).isNotNull().isNotEmpty().hasSize(4).contains(//
                new SemanticDomain("FR_POSTAL_CODE", "FR Postal Code", (float) 58.33), //
                new SemanticDomain("FR_CODE_COMMUNE_INSEE", "FR Insee Code", (float) 58.33), //
                new SemanticDomain("DE_POSTAL_CODE", "DE Postal Code", (float) 58.33), //
                new SemanticDomain("US_POSTAL_CODE", "US Postal Code", (float) 58.33));
        // advance the expectation cursor as its own statement rather than inside an assertion argument
        i++;
    }
    // Without this check the test would pass vacuously if the analysis produced no column at all:
    // the loop body, and therefore every assertion in it, would simply never run.
    assertThat(i, is(expectedNames.length));
}
use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in project data-prep by Talend.
the class SchemaAnalysisTest method testTDP_471.
/**
 * Checks the schema analysis of <code>semantic_type_threshold.csv</code>: a single <code>gender_column</code>
 * column of integer type is expected, with an empty domain and two candidate semantic domains
 * (<code>GENDER</code> and <code>CIVILITY</code>).
 * <p>
 * See <a href="https://jira.talendforge.org/browse/TDP-471">https://jira.talendforge.org/browse/TDP-471</a>.
 */
@Test
public void testTDP_471() {
    final DataSetMetadata actual = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../semantic_type_threshold.csv"));
    assertThat(actual.getLifecycle().schemaAnalyzed(), is(true));
    String[] expectedNames = { "gender_column" };
    Type[] expectedTypes = { Type.INTEGER };
    String[] expectedDomains = { "" };
    int i = 0;
    for (ColumnMetadata column : actual.getRowMetadata().getColumns()) {
        assertThat(column.getName(), is(expectedNames[i]));
        assertThat(column.getType(), is(expectedTypes[i].getName()));
        assertThat(column.getDomain(), is(expectedDomains[i]));
        assertThat(column.getSemanticDomains()).isNotNull().isNotEmpty().hasSize(2).contains(//
                new SemanticDomain("GENDER", "Gender", (float) 35), //
                new SemanticDomain("CIVILITY", "Civility", (float) 20.833334));
        // advance the expectation cursor as its own statement rather than inside an assertion argument
        i++;
    }
    // Without this check the test would pass vacuously if the analysis produced no column at all:
    // the loop body, and therefore every assertion in it, would simply never run.
    assertThat(i, is(expectedNames.length));
}
use of org.talend.dataprep.api.dataset.statistics.SemanticDomain in project data-prep by Talend.
the class TransformationService method getSemanticDomains.
/**
 * Computes the semantic domains of one column by running a semantic analysis over the supplied records.
 *
 * @param metadata the dataset metadata holding the column definition.
 * @param columnId the id of the column to analyze.
 * @param records the dataset records, as a JSON stream deserialized into a {@link DataSet}.
 * @return the semantic domains found on the analyzed copy of the column.
 * @throws IOException if reading or parsing <code>records</code> fails.
 */
private List<SemanticDomain> getSemanticDomains(DataSetMetadata metadata, String columnId, InputStream records) throws IOException {
    // Work on a copy of the column with the "semantic domain forced" flag cleared, to make sure the
    // statistics adapter sets all available domains.
    final ColumnMetadata columnMetadata = column() //
            .copy(metadata.getRowMetadata().getById(columnId)) //
            .semanticDomainForce(false) //
            .build();

    // NOTE(review): the analyzer is never closed in this method; if Analyzer is closeable, consider
    // managing it with try-with-resources - to be confirmed against the Analyzer API.
    final Analyzer<Analyzers.Result> analyzer = analyzerService.build(columnMetadata, SEMANTIC);
    analyzer.init();

    try (JsonParser parser = mapper.getFactory().createParser(new InputStreamReader(records, UTF_8))) {
        final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
        // feed the analyzer with the value of the requested column for every record
        dataSet.getRecords().forEach(record -> analyzer.analyze(record.get(columnId)));
        analyzer.end();
    }

    // push the analysis result onto the column copy, then read the domains back from it
    final List<Analyzers.Result> analyzerResult = analyzer.getResult();
    statisticsAdapter.adapt(singletonList(columnMetadata), analyzerResult);
    return columnMetadata.getSemanticDomains();
}
Aggregations