Usage example of org.talend.dataprep.transformation.actions.date.DateParser in the data-prep project by Talend.
Taken from the class SimpleFilterServiceTest, method should_create_date_RANGE_predicate.
@Test
public void should_create_date_RANGE_predicate() throws Exception {
    // given: a range filter on column "0001" covering [1970-01-01, 1990-01-01) in UTC
    final long rangeEndMillis = LocalDateTime.of(1990, JANUARY, 1, 0, 0).toEpochSecond(UTC) * 1000; // 1990-01-01 UTC
    final String filtersDefinition = "{" + " \"range\": {" + " \"field\": \"0001\"," // 1970-01-01 UTC timezone
            + " \"start\": 0," + " \"end\": " + rangeEndMillis + " }" + "}";

    final ColumnMetadata column = row.getRowMetadata().getById("0001");
    column.setType("date");

    // Stub the date parser: one unparseable value, plus one date per decade around the range bounds.
    final DateParser parser = Mockito.mock(DateParser.class);
    when(parser.parse("a", column)).thenThrow(new DateTimeException(""));
    when(parser.parse("1960-01-01", column)).thenReturn(LocalDateTime.of(1960, JANUARY, 1, 0, 0));
    when(parser.parse("1970-01-01", column)).thenReturn(LocalDateTime.of(1970, JANUARY, 1, 0, 0));
    when(parser.parse("1980-01-01", column)).thenReturn(LocalDateTime.of(1980, JANUARY, 1, 0, 0));
    when(parser.parse("1990-01-01", column)).thenReturn(LocalDateTime.of(1990, JANUARY, 1, 0, 0));
    when(parser.parse("2000-01-01", column)).thenReturn(LocalDateTime.of(2000, JANUARY, 1, 0, 0));
    service.setDateParser(parser);

    // when
    final Predicate<DataSetRow> filter = service.build(filtersDefinition, rowMetadata);

    // then: the range is inclusive at the start and exclusive at the end
    // value that cannot be parsed as a date
    row.set("0001", "a");
    assertThat(filter.test(row), is(false));
    // below the lower bound
    row.set("0001", "1960-01-01");
    assertThat(filter.test(row), is(false));
    // exactly the lower bound (included)
    row.set("0001", "1970-01-01");
    assertThat(filter.test(row), is(true));
    // strictly inside the range
    row.set("0001", "1980-01-01");
    assertThat(filter.test(row), is(true));
    // exactly the upper bound (excluded)
    row.set("0001", "1990-01-01");
    assertThat(filter.test(row), is(false));
    // above the upper bound
    row.set("0001", "2000-01-01");
    assertThat(filter.test(row), is(false));
}
Usage example of org.talend.dataprep.transformation.actions.date.DateParser in the data-prep project by Talend.
Taken from the class ProvidersTest, method shouldCreateDateParser.
@Test
public void shouldCreateDateParser() throws Exception {
    // when: resolve the provider twice, once with the inferred type and once by explicit class
    final DateParser fromInference = Providers.get();
    final DateParser fromClass = Providers.get(DateParser.class);

    // then: both lookups must yield the very same instance (reference identity, not equals)
    final boolean sameInstance = fromInference == fromClass;
    assertTrue(sameInstance);
}
Usage example of org.talend.dataprep.transformation.actions.date.DateParser in the data-prep project by Talend.
Taken from the class AnalyzerService, method build.
/**
 * Build a {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> give
 * all the wanted analysis settings for the analyzer.
 *
 * @param columns A list of columns, may be null or empty.
 * @param settings A varargs with {@link Analysis}. Duplicates are possible in varargs but will be considered only
 * once. Null entries are ignored. Declared dependencies of each setting are included as well.
 * @return A ready to use {@link Analyzer}.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analysis: deduplicate the varargs and pull in each setting's declared dependencies.
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (blank domain falls back to UNKNOWN)
    List<String> domainList = //
            columns.stream().map(//
                    ColumnMetadata::getDomain).map(//
                    d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d).collect(Collectors.toList());
    final String[] domains = domainList.toArray(new String[domainList.size()]);
    DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers.
    // NOTE: iterate over 'all' (not the raw 'settings' varargs) so that duplicates are built only once,
    // dependencies collected above are actually honored, and null entries cannot reach the switch.
    List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) {
        switch (setting) {
        case SEMANTIC:
            final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
            semanticAnalyzer.setLimit(Integer.MAX_VALUE);
            semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
            analyzers.add(semanticAnalyzer);
            break;
        case HISTOGRAM:
            analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
            analyzers.add(new StreamNumberHistogramAnalyzer(types));
            break;
        case QUALITY:
            final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
            columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
            analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer, new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), // NOSONAR
                    true));
            break;
        case CARDINALITY:
            analyzers.add(new CardinalityAnalyzer());
            break;
        case PATTERNS:
            analyzers.add(buildPatternAnalyzer(columns));
            break;
        case LENGTH:
            analyzers.add(new TextLengthAnalyzer());
            break;
        case QUANTILES:
            // Quantiles only make sense when at least one column is numeric.
            boolean acceptQuantiles = false;
            for (DataTypeEnum type : types) {
                if (type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE) {
                    acceptQuantiles = true;
                    break;
                }
            }
            if (acceptQuantiles) {
                analyzers.add(new QuantileAnalyzer(types));
            }
            break;
        case SUMMARY:
            analyzers.add(new SummaryAnalyzer(types));
            break;
        case TYPE:
            // QUALITY already performs type analysis; running both would conflict.
            if (!all.contains(Analysis.QUALITY)) {
                final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
            } else {
                LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
            }
            break;
        case FREQUENCY:
            analyzers.add(new DataTypeFrequencyAnalyzer());
            break;
        default:
            throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    } else {
        return analyzer;
    }
}
Aggregations