use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class StatisticsAdapter method injectSemanticTypes.
/**
 * Updates the semantic domain information of a column from an analysis result.
 * <p>
 * Does nothing when the result holds no {@link SemanticType} or when the column's domain is forced.
 * Otherwise, the first suggested category with a non-empty name is treated as the best match: the column
 * metadata is updated with it when its score is strictly above {@code semanticThreshold}, and the domain is
 * reset when it is not. When no such category exists but the column currently has a domain, the domain is
 * reset as well. In every case the column's semantic domains are replaced by at most 10 converted
 * suggestions whose score is at least 1.
 *
 * @param column the column metadata to update.
 * @param result the analysis result the semantic type is read from.
 */
private void injectSemanticTypes(final ColumnMetadata column, final Analyzers.Result result) {
    // Nothing to inject without semantic information, and a forced domain must not be overridden.
    if (!result.exist(SemanticType.class) || column.isDomainForced()) {
        return;
    }

    final List<CategoryFrequency> categories = result.get(SemanticType.class).getSuggestedCategories();

    // Best match = first suggestion that carries a non-empty category name
    // (presumably the suggestions are ordered by relevance - TODO confirm).
    CategoryFrequency bestCategory = null;
    for (final CategoryFrequency candidate : categories) {
        if (!candidate.getCategoryName().isEmpty()) {
            bestCategory = candidate;
            break;
        }
    }

    if (bestCategory == null) {
        // Column *had* a domain but the new analysis no longer suggests one.
        if (StringUtils.isNotEmpty(column.getDomain())) {
            resetDomain(column);
        }
    } else {
        // TDP-471: Don't pick semantic type if lower than a threshold.
        // TODO (TDP-734) Take into account limit of the semantic analyzer.
        final float score = bestCategory.getScore();
        if (score > semanticThreshold) {
            updateMetadataWithCategoryInfo(column, bestCategory);
        } else {
            // An earlier analysis (e.g. on the first 20 lines) may have been over the threshold while the
            // full scan decides otherwise: make sure the domain is cleared in that case.
            resetDomain(column);
        }
    }

    // Keep the suggested semantic categories (max. 10, score >= 1) in the column metadata.
    final List<SemanticDomain> keptDomains = categories.stream() //
            .map(category -> toSemanticDomain(category)) //
            .filter(domain -> domain != null) //
            .filter(domain -> domain.getScore() >= 1) //
            .limit(10) //
            .collect(Collectors.toList());
    column.setSemanticDomains(keptDomains);
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class ChangeDatePatternTest method should_set_new_pattern_as_most_used_one_newcolumn.
/**
 * Checks that, when the action creates a new column, the statistics of the column with id "0003" contain
 * the pattern given by the "new_pattern" parameter, with 48 occurrences.
 */
@Test
public void should_set_new_pattern_as_most_used_one_newcolumn() throws Exception {
    // given
    final DataSetRow row = builder() //
            .with(value("toto").type(Type.STRING).name("recipe")) //
            .with(value("04/25/1999").type(Type.DATE).name("recipe")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("tata").type(Type.STRING).name("last update")) //
            .build();
    parameters.put(CREATE_NEW_COLUMN, "true");

    // when
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then
    // "0003" is the column inspected here (presumably the one created by the action - TODO confirm).
    final List<PatternFrequency> patternFrequencies = //
            row.getRowMetadata().getById("0003").getStatistics().getPatternFrequencies();
    final String newPattern = parameters.get("new_pattern");
    final Optional<PatternFrequency> newPatternSet = patternFrequencies.stream() //
            .filter(p -> StringUtils.equals(newPattern, p.getPattern())) //
            .findFirst();
    assertTrue(newPatternSet.isPresent());
    // Expected value goes first so that a failure reports expected/actual the right way round.
    assertEquals(48, newPatternSet.get().getOccurrences());
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class ChangeDatePatternTest method should_set_new_pattern_as_most_used_one.
/**
 * Checks that, after the action ran in place, the statistics of the date column (id "0001") contain the
 * pattern given by the "new_pattern" parameter, with 48 occurrences.
 */
@Test
public void should_set_new_pattern_as_most_used_one() throws Exception {
    // given
    final DataSetRow row = builder() //
            .with(value("toto").type(Type.STRING).name("tips")) //
            .with(value("04/25/1999").type(Type.DATE).name("date")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("tata").type(Type.STRING).name("test")) //
            .build();

    // when
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then
    final List<PatternFrequency> patternFrequencies = //
            row.getRowMetadata().getById("0001").getStatistics().getPatternFrequencies();
    final String newPattern = parameters.get("new_pattern");
    final Optional<PatternFrequency> newPatternSet = patternFrequencies.stream() //
            .filter(p -> StringUtils.equals(newPattern, p.getPattern())) //
            .findFirst();
    assertTrue(newPatternSet.isPresent());
    // Expected value goes first so that a failure reports expected/actual the right way round.
    assertEquals(48, newPatternSet.get().getOccurrences());
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class TypeChangeTest method should_not_accept_any_type_to_avoid_transformation_to_be_in_transfo_list.
/**
 * For every {@link Type} value, builds a column of that type carrying the "FR_BEER" domain and asserts
 * that {@code DomainChange#acceptField} returns {@code true} for it.
 * <p>
 * NOTE(review): the method name says "should not accept" while the assertion expects {@code true} -
 * confirm which of the two reflects the intended contract.
 */
@Test
public void should_not_accept_any_type_to_avoid_transformation_to_be_in_transfo_list() {
    // given
    final DomainChange action = new DomainChange();

    for (final Type candidateType : Type.values()) {
        final ColumnMetadata beerColumn = ColumnMetadata.Builder.column() //
                .type(candidateType) //
                .computedId("0002") //
                .domain("FR_BEER") //
                .domainFrequency(1) //
                .domainLabel("French Beer") //
                .build();

        // when / then
        assertThat(action.acceptField(beerColumn)).isTrue();
    }
}
use of org.talend.dataprep.api.type.Type in project data-prep by Talend.
the class QualityAnalysisTest method TDP_1150_string_must_be_detected_as_so_if_even_if_subtype_is_integer.
/**
 * Ensures that the first column of {@code valid_must_be_text1.csv} ("user_id") is detected as a string,
 * even if the sub type (integer) of the most frequent type (String) is used to detect invalid values.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_string_must_be_detected_as_so_if_even_if_subtype_is_integer() {
    // given: metadata initialized from the CSV resource
    final DataSetMetadata metadata = //
            initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../valid_must_be_text1.csv"));

    // then: the schema has been analyzed...
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));

    // ...and the first column is "user_id", typed as a string
    final ColumnMetadata firstColumn = metadata.getRowMetadata().getColumns().get(0);
    assertThat(firstColumn.getName(), is("user_id"));
    assertThat(firstColumn.getType(), is(Type.STRING.getName()));
}
Aggregations