use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class DateParserTest method getPatterns_should_remove_invalid_or_empty_then_sort_patterns.
@Test
public void getPatterns_should_remove_invalid_or_empty_then_sort_patterns() throws IOException {
    // given: a row whose column "0001" carries statistics mixing valid, invalid and empty patterns
    final DataSetRow dataSetRow = ActionMetadataTestUtils.getRow("toto", "04/25/1999", "tata");
    ActionMetadataTestUtils.setStatistics(
            dataSetRow,
            "0001",
            getDateTestJsonAsStream("statistics_with_different_test_cases.json"));
    final List<PatternFrequency> frequencies = dataSetRow
            .getRowMetadata()
            .getById("0001")
            .getStatistics()
            .getPatternFrequencies();

    // when: the action extracts the date patterns from those frequencies
    final List<DatePattern> extracted = action.getPatterns(frequencies);

    // then: only these three patterns are returned, in this exact order
    final List<DatePattern> wanted = new ArrayList<>();
    wanted.add(new DatePattern("MM/dd/yyyy", 47));
    wanted.add(new DatePattern("MM-dd-yy", 27));
    wanted.add(new DatePattern("yyyy", 0));
    assertEquals(wanted, extracted);
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class DateParser method guessPattern.
/**
 * Guess the date pattern of the given value by running the pattern analysis (DQ) on it.
 *
 * @param value the value to guess the pattern from.
 * @param column the column metadata used to build the analyzer.
 * @return the first date pattern computed from the most frequent pattern found for the value.
 * @throws DateTimeException if the value is empty, if no usable pattern is found, or if the analysis itself
 * fails.
 */
DatePattern guessPattern(String value, ColumnMetadata column) {
    if (StringUtils.isEmpty(value)) {
        throw new DateTimeException("No pattern can be found out of '" + value + "'");
    }
    // call DQ on the given value
    try (Analyzer<Analyzers.Result> analyzer = analyzerService.build(column, AnalyzerService.Analysis.PATTERNS)) {
        analyzer.analyze(value);
        analyzer.end();
        // only one value --> only one result
        final Analyzers.Result result = analyzer.getResult().get(0);
        if (!result.exist(PatternFrequencyStatistics.class)) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        final PatternFrequencyStatistics patternFrequencyStatistics = result.get(PatternFrequencyStatistics.class);
        final Map<String, Long> topTerms = patternFrequencyStatistics.getTopK(1);
        final List<PatternFrequency> patterns = new ArrayList<>(topTerms.size());
        for (Map.Entry<String, Long> term : topTerms.entrySet()) {
            patterns.add(new PatternFrequency(term.getKey(), term.getValue()));
        }
        // get & check the results
        final List<DatePattern> results = getPatterns(patterns);
        if (results.isEmpty()) {
            throw new DateTimeException("DQ did not find any pattern for '" + value + "'");
        }
        // only the top pattern was requested: reuse the already computed list instead of computing it again
        return results.get(0);
    } catch (DateTimeException e) {
        // the "no pattern" failures raised above must reach the caller as is, not be re-wrapped below
        throw e;
    } catch (Exception e) {
        // any other failure (analysis or analyzer close) is reported as a DateTimeException, cause preserved
        throw new DateTimeException("Unable to close analyzer after analyzing value '" + value + "'", e);
    }
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class DateParser method guessAndParse.
/**
 * Guess the pattern of the value, parse the value with it and, when parsing succeeds, register the guessed
 * pattern in the column statistics.
 *
 * @param value the date to parse.
 * @param column the column.
 * @return the parsed date.
 * @throws DateTimeException if the date cannot be parsed.
 */
LocalDateTime guessAndParse(String value, ColumnMetadata column) {
    final DatePattern guessed = guessPattern(value, column);
    final List<DatePattern> candidates = Collections.singletonList(guessed);
    final LocalDateTime parsed = parseDateFromPatterns(value, candidates);

    // remember the guessed pattern in the column statistics to prevent future DQ calls
    final List<PatternFrequency> knownFrequencies = column.getStatistics().getPatternFrequencies();
    final PatternFrequency learned = new PatternFrequency(guessed.getPattern(), guessed.getOccurrences());
    knownFrequencies.add(learned);

    return parsed;
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class MaskDataByDomain method compile.
/**
 * Compiles the action: creates the additional column when requested, then registers the value masker
 * matching the column's domain and type in the action context. For date columns the masker also receives
 * the patterns found in the column statistics. Any failure while building the masker is logged and
 * cancels the action.
 *
 * @param actionContext the context of the action being compiled.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);
    if (ActionsUtils.doesCreateNewColumn(actionContext.getParameters(), CREATE_NEW_COLUMN_DEFAULT)) {
        ActionsUtils.createNewColumn(actionContext, singletonList(ActionsUtils.additionalColumn()));
    }
    // nothing more to prepare when the action is not in OK status
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    final ColumnMetadata column = actionContext.getRowMetadata().getById(actionContext.getColumnId());
    final String domain = column.getDomain();
    final Type type = get(column.getType());
    LOGGER.trace(">>> type: " + type + " metadata: " + column);
    try {
        if (!DATE.equals(type)) {
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName()));
        } else {
            // date columns: hand the patterns from the column statistics to the masker
            final List<String> datePatterns = column
                    .getStatistics()
                    .getPatternFrequencies()
                    .stream()
                    .map(frequency -> frequency.getPattern())
                    .collect(toList());
            actionContext.get(MASKER, p -> new ValueDataMasker(domain, type.getName(), datePatterns));
        }
    } catch (Exception e) {
        LOGGER.error(e.getMessage(), e);
        actionContext.setActionStatus(CANCELED);
    }
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class ChangeDatePattern method compile.
/**
 * Compiles the action: creates the target column when requested, compiles the new date pattern and, when
 * everything is still OK, registers that pattern in the target column statistics with an occurrence count
 * one above the most used pattern count of the origin column.
 *
 * @param actionContext the context of the action being compiled.
 */
@Override
public void compile(ActionContext actionContext) {
    super.compile(actionContext);
    final boolean createsNewColumn =
            ActionsUtils.doesCreateNewColumn(actionContext.getParameters(), CREATE_NEW_COLUMN_DEFAULT);
    if (createsNewColumn) {
        ActionsUtils.createNewColumn(actionContext,
                singletonList(ActionsUtils
                        .additionalColumn()
                        .withName(actionContext.getColumnName() + NEW_COLUMN_SUFFIX)
                        .withCopyMetadataFromId(actionContext.getColumnId())));
    }
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    compileDatePattern(actionContext);
    // the status is checked again right after compiling the date pattern
    if (actionContext.getActionStatus() != OK) {
        return;
    }

    // register the new pattern in column's stats as the most used pattern,
    // to be able to process date action more efficiently later
    final DatePattern newPattern = actionContext.get(COMPILED_DATE_PATTERN);
    final RowMetadata rowMetadata = actionContext.getRowMetadata();
    // target column
    final String targetId = ActionsUtils.getTargetColumnId(actionContext);
    final ColumnMetadata targetColumn = rowMetadata.getById(targetId);
    // origin column
    final ColumnMetadata originColumn = rowMetadata.getById(actionContext.getColumnId());

    // if the target column is not the original column, we shouldn't use the same statistics object
    final Statistics statistics;
    if (!createsNewColumn) {
        statistics = targetColumn.getStatistics();
    } else {
        statistics = new Statistics(originColumn.getStatistics());
        targetColumn.setStatistics(statistics);
    }

    actionContext.get(FROM_DATE_PATTERNS, p -> compileFromDatePattern(actionContext));

    // reuse the frequency entry of the new pattern when there is one, otherwise add a fresh entry
    PatternFrequency newPatternFrequency = null;
    for (PatternFrequency candidate : statistics.getPatternFrequencies()) {
        if (StringUtils.equals(candidate.getPattern(), newPattern.getPattern())) {
            newPatternFrequency = candidate;
            break;
        }
    }
    if (newPatternFrequency == null) {
        newPatternFrequency = new PatternFrequency(newPattern.getPattern(), 0);
        statistics.getPatternFrequencies().add(newPatternFrequency);
    }

    final long mostUsedPatternCount = getMostUsedPatternCount(originColumn);
    newPatternFrequency.setOccurrences(mostUsedPatternCount + 1);
    rowMetadata.update(targetId, targetColumn);
}
Aggregations