Search in sources :

Example 21 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class DateParserTest method getPatterns_should_remove_invalid_or_empty_then_sort_patterns.

/**
 * Feeds {@code getPatterns} with column statistics that mix valid, invalid and empty patterns, and checks that
 * exactly three date patterns come back, in this order: {@code MM/dd/yyyy} (47), {@code MM-dd-yy} (27) and
 * {@code yyyy} (0).
 */
@Test
public void getPatterns_should_remove_invalid_or_empty_then_sort_patterns() throws IOException {
    // given: the patterns that must survive, in the order they must be returned
    final List<DatePattern> expectedPatterns = new ArrayList<>();
    expectedPatterns.add(new DatePattern("MM/dd/yyyy", 47));
    expectedPatterns.add(new DatePattern("MM-dd-yy", 27));
    expectedPatterns.add(new DatePattern("yyyy", 0));

    // given: a row whose column "0001" carries statistics with valid, invalid and empty patterns
    final DataSetRow dataSetRow = ActionMetadataTestUtils.getRow("toto", "04/25/1999", "tata");
    ActionMetadataTestUtils.setStatistics(dataSetRow, "0001",
            getDateTestJsonAsStream("statistics_with_different_test_cases.json"));
    final List<PatternFrequency> columnFrequencies = dataSetRow //
            .getRowMetadata() //
            .getById("0001") //
            .getStatistics() //
            .getPatternFrequencies();

    // when
    final List<DatePattern> computedPatterns = action.getPatterns(columnFrequencies);

    // then
    assertEquals(expectedPatterns, computedPatterns);
}
Also used : PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) ArrayList(java.util.ArrayList) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) Test(org.junit.Test)

Example 22 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class DateParser method guessPattern.

/**
 * Guess the date pattern of the given value.
 *
 * <p>
 * The value is submitted to a pattern analyzer built for the column. The top pattern it reports is then
 * converted through {@code getPatterns} and the first resulting pattern is returned.
 * </p>
 *
 * @param value the value to guess the date pattern from.
 * @param column the column metadata used to build the analyzer.
 * @return the first date pattern found for the value.
 * @throws DateTimeException if the value is empty, if no pattern could be found for it, or if the analyzer
 * fails while analyzing the value or while being closed.
 */
DatePattern guessPattern(String value, ColumnMetadata column) {
    if (StringUtils.isEmpty(value)) {
        throw new DateTimeException("No pattern can be found out of '" + value + "'");
    }
    // call DQ on the given value
    try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(column, AnalyzerService.Analysis.PATTERNS)) {
        analyzer.analyze(value);
        analyzer.end();
        // only one value --> only one result
        final Analyzers.Result result = analyzer.getResult().get(0);
        if (!result.exist(PatternFrequencyStatistics.class)) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        final PatternFrequencyStatistics patternFrequencyStatistics = result.get(PatternFrequencyStatistics.class);
        final Map<String, Long> topTerms = patternFrequencyStatistics.getTopK(1);
        final List<PatternFrequency> patterns = new ArrayList<>(1);
        for (Map.Entry<String, Long> term : topTerms.entrySet()) {
            patterns.add(new PatternFrequency(term.getKey(), term.getValue()));
        }
        // get & check the results
        final List<DatePattern> results = getPatterns(patterns);
        if (results.isEmpty()) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        // as Christopher L. said : "there can be only one" :-)
        // reuse the list computed above instead of calling getPatterns a second time
        return results.get(0);
    } catch (DateTimeException e) {
        // "no pattern found" errors raised above must reach the caller as they are, not disguised
        // as an analyzer closing failure by the generic handler below
        throw e;
    } catch (Exception e) {
        throw new DateTimeException("Unable to close analyzer after analyzing value '" + value + "'", e);
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) DateTimeException(java.time.DateTimeException) DateTimeException(java.time.DateTimeException) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics)

Example 23 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class DateParser method guessAndParse.

/**
 * Guesses the pattern of the given value, parses the value with that single pattern and, once parsing has
 * succeeded, appends the guessed pattern to the pattern frequencies of the column statistics.
 *
 * @param value the date to parse.
 * @param column the column whose statistics receive the guessed pattern.
 * @return the parsed date.
 * @throws DateTimeException if the date cannot be parsed.
 */
LocalDateTime guessAndParse(String value, ColumnMetadata column) {
    final DatePattern pattern = guessPattern(value, column);
    final List<DatePattern> onlyGuessedPattern = Collections.singletonList(pattern);
    final LocalDateTime parsedDate = parseDateFromPatterns(value, onlyGuessedPattern);

    // remember the guessed pattern in the column statistics to prevent future DQ calls
    column.getStatistics() //
            .getPatternFrequencies() //
            .add(new PatternFrequency(pattern.getPattern(), pattern.getOccurrences()));

    return parsedDate;
}
Also used : LocalDateTime(java.time.LocalDateTime) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency)

Example 24 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class MaskDataByDomain method compile.

/**
 * Compiles the action: creates the additional column when the parameters ask for one, then, if the action
 * status is still {@code OK}, registers a {@link ValueDataMasker} in the action context under {@code MASKER}.
 * For a {@code DATE} column the masker also receives the patterns listed in the column statistics.
 * Any exception raised while registering the masker is logged and the action status is set to
 * {@code CANCELED}.
 *
 * @param actionContext the context of the action being compiled.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);

    final boolean newColumnRequested = ActionsUtils.doesCreateNewColumn(actionContext.getParameters(),
            CREATE_NEW_COLUMN_DEFAULT);
    if (newColumnRequested) {
        ActionsUtils.createNewColumn(actionContext, singletonList(ActionsUtils.additionalColumn()));
    }

    // nothing more to prepare unless the action is still in the OK state
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    final RowMetadata metadata = actionContext.getRowMetadata();
    final ColumnMetadata column = metadata.getById(actionContext.getColumnId());
    final String domain = column.getDomain();
    final Type type = get(column.getType());
    LOGGER.trace(">>> type: " + type + " metadata: " + column);

    try {
        if (!DATE.equals(type)) {
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName()));
        } else {
            // date columns: hand the patterns from the column statistics over to the masker
            final List<PatternFrequency> frequencies = column.getStatistics().getPatternFrequencies();
            final List<String> dateTimePatterns = frequencies //
                    .stream() //
                    .map(PatternFrequency::getPattern) //
                    .collect(toList());
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName(), dateTimePatterns));
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage(), e);
        actionContext.setActionStatus(CANCELED);
    }
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) Type(org.talend.dataprep.api.type.Type) ValueDataMasker(org.talend.dataquality.semantic.datamasking.ValueDataMasker) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata)

Example 25 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class ChangeDatePattern method compile.

/**
 * Compiles the action: optionally creates the new target column, compiles the requested date pattern and,
 * if the action status is still {@code OK}, registers that pattern in the target column statistics with an
 * occurrence count one above the value returned by {@code getMostUsedPatternCount} for the origin column.
 *
 * @param actionContext the context of the action being compiled.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);

    final boolean createsNewColumn = ActionsUtils.doesCreateNewColumn(actionContext.getParameters(),
            CREATE_NEW_COLUMN_DEFAULT);
    if (createsNewColumn) {
        ActionsUtils.createNewColumn(actionContext,
                singletonList(ActionsUtils
                        .additionalColumn()
                        .withName(actionContext.getColumnName() + NEW_COLUMN_SUFFIX)
                        .withCopyMetadataFromId(actionContext.getColumnId())));
    }
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    compileDatePattern(actionContext);
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    // register the new pattern in the column's stats as the most used pattern,
    // to be able to process date actions more efficiently later
    final DatePattern compiledPattern = actionContext.get(COMPILED_DATE_PATTERN);
    final RowMetadata metadata = actionContext.getRowMetadata();

    // target column
    final String targetColumnId = ActionsUtils.getTargetColumnId(actionContext);
    final ColumnMetadata targetColumn = metadata.getById(targetColumnId);

    // origin column
    final ColumnMetadata originColumn = metadata.getById(actionContext.getColumnId());

    // if the target column is not the original column, we shouldn't use the same statistics object
    final Statistics targetStatistics;
    if (!createsNewColumn) {
        targetStatistics = targetColumn.getStatistics();
    } else {
        targetStatistics = new Statistics(originColumn.getStatistics());
        targetColumn.setStatistics(targetStatistics);
    }

    actionContext.get(FROM_DATE_PATTERNS, p -> compileFromDatePattern(actionContext));

    // reuse the frequency entry already holding the new pattern, or append a fresh one
    PatternFrequency compiledPatternFrequency = null;
    for (PatternFrequency candidate : targetStatistics.getPatternFrequencies()) {
        if (StringUtils.equals(candidate.getPattern(), compiledPattern.getPattern())) {
            compiledPatternFrequency = candidate;
            break;
        }
    }
    if (compiledPatternFrequency == null) {
        compiledPatternFrequency = new PatternFrequency(compiledPattern.getPattern(), 0);
        targetStatistics.getPatternFrequencies().add(compiledPatternFrequency);
    }

    // one more occurrence than the current count makes the new pattern the most used one
    final long highestCount = getMostUsedPatternCount(originColumn);
    compiledPatternFrequency.setOccurrences(highestCount + 1);
    metadata.update(targetColumnId, targetColumn);
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) RowMetadata(org.talend.dataprep.api.dataset.RowMetadata) Statistics(org.talend.dataprep.api.dataset.statistics.Statistics)

Aggregations

PatternFrequency (org.talend.dataprep.api.dataset.statistics.PatternFrequency)25 Test (org.junit.Test)14 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)11 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)8 Before (org.junit.Before)7 RowMetadata (org.talend.dataprep.api.dataset.RowMetadata)4 HashMap (java.util.HashMap)3 List (java.util.List)3 StringUtils (org.apache.commons.lang.StringUtils)3 CoreMatchers.is (org.hamcrest.CoreMatchers.is)3 Type (org.talend.dataprep.api.type.Type)3 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 Arrays (java.util.Arrays)2 Collections (java.util.Collections)2 Locale (java.util.Locale)2 Map (java.util.Map)2 Optional (java.util.Optional)2 Assert.assertEquals (org.junit.Assert.assertEquals)2 Assert.assertFalse (org.junit.Assert.assertFalse)2