Search in sources :

Example 21 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class QualityAnalysisTest method TDP_1150_text_must_be_detected_if_even_if_integer_is_more_frequent.

/**
 * Checks the name and type detected for the first column of {@code valid_must_be_text_2.csv}.
 * <p>
 * Once the schema analysis is flagged as done, the first column must be named {@code user_id} and its type must
 * be the name of {@link Type#INTEGER}.
 * <p>
 * Rationale carried over from the previous comment (it talks about dates; verify it still applies to this
 * test): this is not a perfect solution, but no better way was found to avoid displaying incoherent results.
 * When a date is detected during sampling, is not the most frequent type during the full run, and three
 * different types are present, String is returned as the type.
 * <p>
 * NOTE(review): the method name says text must be detected, yet the assertion expects INTEGER - confirm which
 * one is intended.
 * <p>
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_text_must_be_detected_if_even_if_integer_is_more_frequent() {
    final DataSetMetadata metadata = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../valid_must_be_text_2.csv"));
    // The schema analysis must have completed before column metadata is inspected.
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));
    // Only the first column is checked by this test.
    final ColumnMetadata firstColumn = metadata.getRowMetadata().getColumns().get(0);
    assertThat(firstColumn.getName(), is("user_id"));
    assertThat(firstColumn.getType(), is(Type.INTEGER.getName()));
}
Also used : Type(org.talend.dataprep.api.type.Type) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataSetServiceTest(org.talend.dataprep.dataset.service.DataSetServiceTest) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Test(org.junit.Test) DataSetBaseTest(org.talend.dataprep.dataset.DataSetBaseTest) DataSetServiceTest(org.talend.dataprep.dataset.service.DataSetServiceTest)

Example 22 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class SchemaAnalysisTest method testGenderAnalysis.

/**
 * Checks the schema detected for {@code gender.csv}: for every column, in order, the name, the type name and
 * the domain must match the expected values listed below.
 */
@Test
public void testGenderAnalysis() {
    final DataSetMetadata metadata = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../gender.csv"));
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));
    // Expected values, one entry per column position.
    // Gender must be a String with Gender domain
    final String[] names = { "name", "bounty", "gender" };
    final Type[] types = { Type.STRING, Type.INTEGER, Type.STRING };
    final String[] domains = { "FIRST_NAME", "", "GENDER" };
    // Walk the columns in order, comparing each one with the expectation at the same position.
    int position = 0;
    for (ColumnMetadata current : metadata.getRowMetadata().getColumns()) {
        final Type type = types[position];
        assertThat(current.getName(), is(names[position]));
        assertThat(current.getType(), is(type.getName()));
        assertThat(current.getDomain(), is(domains[position]));
        position++;
    }
}
Also used : Type(org.talend.dataprep.api.type.Type) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DataSetServiceTest(org.talend.dataprep.dataset.service.DataSetServiceTest) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Test(org.junit.Test) DataSetBaseTest(org.talend.dataprep.dataset.DataSetBaseTest) DataSetServiceTest(org.talend.dataprep.dataset.service.DataSetServiceTest)

Example 23 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class DataSetMetadataRepositoryTestUtils method ensureThatOnlyCompatibleDataSetsAreReturned.

/**
 * Verifies that {@code repository.listCompatible(id)} returns exactly the data sets that are different from the
 * reference data set and that the reference reports as compatible.
 * <p>
 * Three data sets are built and saved: "0001" (the reference, STRING "first" then STRING "last"), "0002" (the
 * same two STRING columns declared in the opposite order) and "0003" (STRING "first" then INTEGER "last"). The
 * expected list is computed with {@code DataSetMetadata#equals} and {@code DataSetMetadata#compatible}, so this
 * check does not hard-code which of them are compatible.
 *
 * @param repository the repository under test; the three data sets are saved into it.
 * @param builder the builder used to create the data set metadata.
 */
public static void ensureThatOnlyCompatibleDataSetsAreReturned(DataSetMetadataRepository repository, DataSetMetadataBuilder builder) {
    // given
    final DataSetMetadata reference = builder.metadata()
            .id("0001")
            .row(column().type(Type.STRING).name("first"), column().type(Type.STRING).name("last"))
            .build();
    final DataSetMetadata swappedColumns = builder.metadata()
            .id("0002")
            .row(column().type(Type.STRING).name("last"), column().type(Type.STRING).name("first"))
            .build();
    final DataSetMetadata integerLast = builder.metadata()
            .id("0003")
            .row(column().type(Type.STRING).name("first"), column().type(Type.INTEGER).name("last"))
            .build();
    final List<DataSetMetadata> allMetadata = Arrays.asList(reference, swappedColumns, integerLast);
    // retrieve set of data sets which are different from the reference but with similar schema
    final List<DataSetMetadata> expected = allMetadata.stream()
            .filter(candidate -> !reference.equals(candidate))
            .filter(candidate -> reference.compatible(candidate))
            .sorted((left, right) -> left.getId().compareTo(right.getId()))
            .collect(Collectors.toList());
    // when
    allMetadata.forEach(entry -> repository.save(entry));
    final Iterable<DataSetMetadata> compatible = repository.listCompatible(reference.getId());
    // Sort by id so the comparison does not depend on the order the repository returns.
    final List<DataSetMetadata> actual = StreamSupport.stream(compatible.spliterator(), false)
            .sorted((left, right) -> left.getId().compareTo(right.getId()))
            .collect(Collectors.toList());
    // then
    assertEquals(expected, actual);
}
Also used : Arrays(java.util.Arrays) List(java.util.List) Type(org.talend.dataprep.api.type.Type) DataSetMetadataBuilder(org.talend.dataprep.dataset.DataSetMetadataBuilder) Builder.column(org.talend.dataprep.api.dataset.ColumnMetadata.Builder.column) StreamSupport(java.util.stream.StreamSupport) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Collectors(java.util.stream.Collectors) Assert.assertEquals(org.junit.Assert.assertEquals) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata)

Example 24 with Type

use of org.talend.dataprep.api.type.Type in project data-prep by Talend.

the class MaskDataByDomain method compile.

/**
 * Prepares the action: after the parent compilation, optionally creates the additional column, then - only if
 * the action status is OK - builds a {@link ValueDataMasker} from the target column's domain and type and hands
 * it to {@code actionContext.get(MASKER, ...)}.
 * <p>
 * For a DATE column the masker is additionally given the patterns taken from the column's pattern-frequency
 * statistics. Any exception raised while setting up the masker is logged and the action status is set to
 * CANCELED.
 *
 * @param actionContext the context providing the parameters, row metadata, column id and action status.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);
    if (ActionsUtils.doesCreateNewColumn(actionContext.getParameters(), CREATE_NEW_COLUMN_DEFAULT)) {
        ActionsUtils.createNewColumn(actionContext, singletonList(ActionsUtils.additionalColumn()));
    }
    // Nothing more to prepare unless the action is still in the OK state.
    if (actionContext.getActionStatus() != OK) {
        return;
    }
    final ColumnMetadata column = actionContext.getRowMetadata().getById(actionContext.getColumnId());
    final String domain = column.getDomain();
    final Type type = get(column.getType());
    LOGGER.trace(">>> type: " + type + " metadata: " + column);
    try {
        if (!DATE.equals(type)) {
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName()));
        } else {
            // Date columns: pass the observed patterns along to the masker.
            final List<String> dateTimePatterns = column.getStatistics()
                    .getPatternFrequencies()
                    .stream()
                    .map(PatternFrequency::getPattern)
                    .collect(toList());
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName(), dateTimePatterns));
        }
    } catch (Exception e) {
        // Failure to set up the masker cancels the action instead of propagating.
        LOGGER.error(e.getMessage(), e);
        actionContext.setActionStatus(CANCELED);
    }
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) Type(org.talend.dataprep.api.type.Type) ValueDataMasker(org.talend.dataquality.semantic.datamasking.ValueDataMasker) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata)

Aggregations

Type (org.talend.dataprep.api.type.Type)24 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)21 Test (org.junit.Test)17 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)14 DataSetBaseTest (org.talend.dataprep.dataset.DataSetBaseTest)13 DataSetServiceTest (org.talend.dataprep.dataset.service.DataSetServiceTest)12 Arrays (java.util.Arrays)4 List (java.util.List)4 Optional (java.util.Optional)3 StringUtils (org.apache.commons.lang.StringUtils)3 Assert.assertEquals (org.junit.Assert.assertEquals)3 Builder.column (org.talend.dataprep.api.dataset.ColumnMetadata.Builder.column)3 RowMetadata (org.talend.dataprep.api.dataset.RowMetadata)3 PatternFrequency (org.talend.dataprep.api.dataset.statistics.PatternFrequency)3 IOException (java.io.IOException)2 Collections (java.util.Collections)2 HashMap (java.util.HashMap)2 Locale (java.util.Locale)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2