Example use of org.talend.dataprep.api.type.Type in the data-prep project by Talend.
From class QualityAnalysisTest, method TDP_1150_text_must_be_detected_if_even_if_integer_is_more_frequent.
/**
 * Regression test for TDP-1150: runs the analysis on {@code valid_must_be_text_2.csv} and checks the name and
 * type reported for the first column.
 *
 * Background from the original author: this is not a perfect solution, but no better way was found to avoid
 * displaying incoherent results. When a date is detected during sampling, is not the most frequent type during
 * the full run, and three different types are present, String is returned as the type.
 *
 * NOTE(review): the method name talks about text being detected, while the assertion below expects
 * {@code Type.INTEGER} for the {@code user_id} column -- confirm which one is intended.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_text_must_be_detected_if_even_if_integer_is_more_frequent() {
    // Load the CSV fixture and build its metadata.
    final DataSetMetadata metadata = initializeDataSetMetadata(
            DataSetServiceTest.class.getResourceAsStream("../valid_must_be_text_2.csv"));

    // The lifecycle must report the schema as analyzed before the column is inspected.
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));

    // Only the first column is checked.
    final ColumnMetadata firstColumn = metadata.getRowMetadata().getColumns().get(0);
    assertThat(firstColumn.getName(), is("user_id"));
    assertThat(firstColumn.getType(), is(Type.INTEGER.getName()));
}
Example use of org.talend.dataprep.api.type.Type in the data-prep project by Talend.
From class SchemaAnalysisTest, method testGenderAnalysis.
/**
 * Runs the analysis on {@code gender.csv} and checks, column by column, the reported name, type and
 * semantic domain. The gender column is expected to be a String carrying the GENDER domain.
 */
@Test
public void testGenderAnalysis() {
    final DataSetMetadata metadata = initializeDataSetMetadata(
            DataSetServiceTest.class.getResourceAsStream("../gender.csv"));
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));

    // Expected values, indexed by column position.
    final String[] names = { "name", "bounty", "gender" };
    final Type[] types = { Type.STRING, Type.INTEGER, Type.STRING };
    final String[] domains = { "FIRST_NAME", "", "GENDER" };

    int nextPosition = 0;
    for (ColumnMetadata current : metadata.getRowMetadata().getColumns()) {
        // Capture this column's position, then advance for the next iteration.
        final int position = nextPosition++;
        assertThat(current.getName(), is(names[position]));
        assertThat(current.getType(), is(types[position].getName()));
        assertThat(current.getDomain(), is(domains[position]));
    }
}
Example use of org.talend.dataprep.api.type.Type in the data-prep project by Talend.
From class DataSetMetadataRepositoryTestUtils, method ensureThatOnlyCompatibleDataSetsAreReturned.
/**
 * Checks that {@code repository.listCompatible(id)} returns the same data sets as filtering locally with
 * {@code equals} and {@code compatible} against the reference metadata.
 *
 * Three data sets are built: "0001" (the reference, STRING "first" / STRING "last"), "0002" (the same two
 * columns in reverse order) and "0003" (STRING "first" / INTEGER "last"). Both lists are sorted by id before
 * being compared.
 *
 * @param repository the repository the data sets are saved to and queried from.
 * @param builder the builder used to create the three data set metadata.
 */
public static void ensureThatOnlyCompatibleDataSetsAreReturned(DataSetMetadataRepository repository, DataSetMetadataBuilder builder) {
    // given
    final DataSetMetadata reference = builder.metadata().id("0001")
            .row(column().type(Type.STRING).name("first"), column().type(Type.STRING).name("last"))
            .build();
    final DataSetMetadata swappedColumns = builder.metadata().id("0002")
            .row(column().type(Type.STRING).name("last"), column().type(Type.STRING).name("first"))
            .build();
    final DataSetMetadata integerLast = builder.metadata().id("0003")
            .row(column().type(Type.STRING).name("first"), column().type(Type.INTEGER).name("last"))
            .build();
    List<DataSetMetadata> allMetadata = Arrays.asList(reference, swappedColumns, integerLast);

    // expected result: every data set other than the reference that the reference reports as compatible
    List<DataSetMetadata> expected = allMetadata.stream()
            .filter(candidate -> !reference.equals(candidate))
            .filter(candidate -> reference.compatible(candidate))
            .sorted((left, right) -> left.getId().compareTo(right.getId()))
            .collect(Collectors.toList());

    // when
    for (DataSetMetadata toSave : allMetadata) {
        repository.save(toSave);
    }
    Iterable<DataSetMetadata> compatible = repository.listCompatible(reference.getId());
    List<DataSetMetadata> actual = StreamSupport.stream(compatible.spliterator(), false)
            .sorted((left, right) -> left.getId().compareTo(right.getId()))
            .collect(Collectors.toList());

    // then
    assertEquals(expected, actual);
}
Example use of org.talend.dataprep.api.type.Type in the data-prep project by Talend.
From class MaskDataByDomain, method compile.
/**
 * Compiles this action: after the parent compilation, optionally creates the additional column, then, if the
 * action status is {@code OK}, asks the context for the {@code MASKER} entry with a factory that builds a
 * {@link ValueDataMasker} from the target column's domain and type.
 *
 * For a {@code DATE} column the patterns from the column's pattern-frequency statistics are passed to the
 * masker as well. Any exception raised while setting up the masker is logged and the action status is set to
 * {@code CANCELED}.
 *
 * @param actionContext the context of the action being compiled.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);

    final boolean createsNewColumn = ActionsUtils.doesCreateNewColumn(actionContext.getParameters(), CREATE_NEW_COLUMN_DEFAULT);
    if (createsNewColumn) {
        ActionsUtils.createNewColumn(actionContext, singletonList(ActionsUtils.additionalColumn()));
    }

    // Nothing more to prepare unless the action is still in the OK state.
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    final ColumnMetadata column = actionContext.getRowMetadata().getById(actionContext.getColumnId());
    final String domain = column.getDomain();
    final Type type = get(column.getType());
    LOGGER.trace(">>> type: " + type + " metadata: " + column);

    try {
        if (!DATE.equals(type)) {
            actionContext.get(MASKER, key -> new ValueDataMasker(domain, type.getName()));
        } else {
            // Date columns also hand their observed patterns to the masker.
            final List<PatternFrequency> frequencies = column.getStatistics().getPatternFrequencies();
            final List<String> datePatterns = frequencies.stream()
                    .map(PatternFrequency::getPattern)
                    .collect(toList());
            actionContext.get(MASKER, key -> new ValueDataMasker(domain, type.getName(), datePatterns));
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage(), e);
        actionContext.setActionStatus(CANCELED);
    }
}
Aggregations