Search in sources :

Example 1 with DateParser

use of org.talend.dataprep.transformation.actions.date.DateParser in project data-prep by Talend.

the class SimpleFilterServiceTest method should_create_date_RANGE_predicate.

@Test
public void should_create_date_RANGE_predicate() throws Exception {
    // given: a "range" filter on column 0001, from epoch 0 (1970-01-01 UTC) up to 1990-01-01 UTC, in milliseconds
    final long upperBoundMillis = LocalDateTime.of(1990, JANUARY, 1, 0, 0).toEpochSecond(UTC) * 1000;
    final String rangeJson = "{"
            + "   \"range\": {"
            + "       \"field\": \"0001\","
            + "       \"start\": 0,"
            + "       \"end\": " + upperBoundMillis
            + "   }"
            + "}";

    final ColumnMetadata dateColumn = row.getRowMetadata().getById("0001");
    dateColumn.setType("date");

    // The parser is mocked: "a" cannot be parsed, every other value maps to January 1st of its year.
    final DateParser parserMock = Mockito.mock(DateParser.class);
    when(parserMock.parse("a", dateColumn)).thenThrow(new DateTimeException(""));
    for (int year = 1960; year <= 2000; year += 10) {
        when(parserMock.parse(year + "-01-01", dateColumn)).thenReturn(LocalDateTime.of(year, JANUARY, 1, 0, 0));
    }
    service.setDateParser(parserMock);

    // when
    final Predicate<DataSetRow> rangeFilter = service.build(rangeJson, rowMetadata);

    // then
    // value that cannot be parsed as a date is rejected
    row.set("0001", "a");
    assertThat(rangeFilter.test(row), is(false));

    // before the lower bound
    row.set("0001", "1960-01-01");
    assertThat(rangeFilter.test(row), is(false));

    // exactly the lower bound
    row.set("0001", "1970-01-01");
    assertThat(rangeFilter.test(row), is(true));

    // strictly inside the range
    row.set("0001", "1980-01-01");
    assertThat(rangeFilter.test(row), is(true));

    // exactly the upper bound (expected to be rejected)
    row.set("0001", "1990-01-01");
    assertThat(rangeFilter.test(row), is(false));

    // after the upper bound
    row.set("0001", "2000-01-01");
    assertThat(rangeFilter.test(row), is(false));
}
Also used : DateParser(org.talend.dataprep.transformation.actions.date.DateParser) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) DateTimeException(java.time.DateTimeException) DataSetRow(org.talend.dataprep.api.dataset.row.DataSetRow) Test(org.junit.Test)

Example 2 with DateParser

use of org.talend.dataprep.transformation.actions.date.DateParser in project data-prep by Talend.

the class ProvidersTest method shouldCreateDateParser.

@Test
public void shouldCreateDateParser() throws Exception {
    // when: the parser is looked up twice, once without and once with an explicit class argument
    final DateParser defaultLookup = Providers.get();
    final DateParser typedLookup = Providers.get(DateParser.class);

    // then: both lookups must hand back the very same instance (reference equality)
    assertTrue(defaultLookup == typedLookup);
}
Also used : DateParser(org.talend.dataprep.transformation.actions.date.DateParser) Test(org.junit.Test)

Example 3 with DateParser

use of org.talend.dataprep.transformation.actions.date.DateParser in project data-prep by Talend.

the class AnalyzerService method build.

/**
 * Build a {@link Analyzer} to analyze records with columns (in <code>columns</code>). <code>settings</code> give
 * all the wanted analysis settings for the analyzer.
 * <p>
 * The requested settings are first normalized: <code>null</code> entries are ignored, duplicates are collapsed and
 * the declared dependencies of each {@link Analysis} are added. Analyzers are then created once per resulting
 * {@link Analysis}, in the declaration order of the enum (not in the order of the varargs).
 *
 * @param columns  A list of columns, may be null or empty.
 * @param settings A varargs with {@link Analysis}. Duplicates are possible in varargs but will be considered only
 *                 once. <code>null</code> entries (or a <code>null</code> array) are ignored.
 * @return A ready to use {@link Analyzer}. A {@link NullAnalyzer} based analyzer is returned when there is no
 *         column or no analysis to perform.
 * @throws IllegalArgumentException if an {@link Analysis} has no matching analyzer in this method.
 */
public Analyzer<Analyzers.Result> build(List<ColumnMetadata> columns, Analysis... settings) {
    if (columns == null || columns.isEmpty() || settings == null) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Get all needed analysis (null-free, de-duplicated, dependencies included)
    final Set<Analysis> all = EnumSet.noneOf(Analysis.class);
    for (Analysis setting : settings) {
        if (setting != null) {
            all.add(setting);
            all.addAll(Arrays.asList(setting.dependencies));
        }
    }
    if (all.isEmpty()) {
        return Analyzers.with(NullAnalyzer.INSTANCE);
    }
    // Column types
    final DataTypeEnum[] types = TypeUtils.convert(columns);
    // Semantic domains (blank domains are replaced with the UNKNOWN category id)
    final String[] domains = columns.stream() //
            .map(ColumnMetadata::getDomain) //
            .map(d -> StringUtils.isBlank(d) ? SemanticCategoryEnum.UNKNOWN.getId() : d) //
            .toArray(String[]::new);
    final DictionarySnapshot dictionarySnapshot = dictionarySnapshotProvider.get();
    // Build all analyzers. Iterate on the normalized set (and not on the raw varargs) so that a null entry cannot
    // reach the switch, a duplicated setting does not register the same analyzer twice and dependencies are built.
    final List<Analyzer> analyzers = new ArrayList<>();
    for (Analysis setting : all) {
        switch (setting) {
            case SEMANTIC:
                final SemanticAnalyzer semanticAnalyzer = new SemanticAnalyzer(dictionarySnapshot);
                semanticAnalyzer.setLimit(Integer.MAX_VALUE);
                semanticAnalyzer.setMetadata(Metadata.HEADER_NAME, extractColumnNames(columns));
                analyzers.add(semanticAnalyzer);
                break;
            case HISTOGRAM:
                // One histogram analyzer for dates, one for numbers
                analyzers.add(new StreamDateHistogramAnalyzer(columns, types, dateParser));
                analyzers.add(new StreamNumberHistogramAnalyzer(types));
                break;
            case QUALITY:
                final DataTypeQualityAnalyzer dataTypeQualityAnalyzer = new DataTypeQualityAnalyzer(types);
                columns.forEach(c -> dataTypeQualityAnalyzer.addCustomDateTimePattern(RowMetadataUtils.getMostUsedDatePattern(c)));
                analyzers.add(new ValueQualityAnalyzer(dataTypeQualityAnalyzer, new SemanticQualityAnalyzer(dictionarySnapshot, domains, false), // NOSONAR
                true));
                break;
            case CARDINALITY:
                analyzers.add(new CardinalityAnalyzer());
                break;
            case PATTERNS:
                analyzers.add(buildPatternAnalyzer(columns));
                break;
            case LENGTH:
                analyzers.add(new TextLengthAnalyzer());
                break;
            case QUANTILES:
                // Quantiles are only computed when at least one column is numeric
                final boolean acceptQuantiles = Arrays.stream(types)
                        .anyMatch(type -> type == DataTypeEnum.INTEGER || type == DataTypeEnum.DOUBLE);
                if (acceptQuantiles) {
                    analyzers.add(new QuantileAnalyzer(types));
                }
                break;
            case SUMMARY:
                analyzers.add(new SummaryAnalyzer(types));
                break;
            case TYPE:
                // Type analysis is skipped whenever quality analysis is also requested
                if (all.contains(Analysis.QUALITY)) {
                    LOGGER.warn("Disabled {} analysis (conflicts with {}).", setting, Analysis.QUALITY);
                } else {
                    final List<String> mostUsedDatePatterns = getMostUsedDatePatterns(columns);
                    analyzers.add(new DataTypeAnalyzer(mostUsedDatePatterns));
                }
                break;
            case FREQUENCY:
                analyzers.add(new DataTypeFrequencyAnalyzer());
                break;
            default:
                throw new IllegalArgumentException("Missing support for '" + setting + "'.");
        }
    }
    // Merge all analyzers into one
    final Analyzer<Analyzers.Result> analyzer = Analyzers.with(analyzers.toArray(new Analyzer[analyzers.size()]));
    analyzer.init();
    if (LOGGER.isDebugEnabled()) {
        // Wrap analyzer for usage monitoring (to diagnose non-closed analyzer issues).
        return new ResourceMonitoredAnalyzer(analyzer);
    } else {
        return analyzer;
    }
}
Also used : Analyzers(org.talend.dataquality.common.inference.Analyzers) java.util(java.util) StringUtils(org.apache.commons.lang.StringUtils) CardinalityStatistics(org.talend.dataquality.statistics.cardinality.CardinalityStatistics) TypeUtils(org.talend.dataprep.api.type.TypeUtils) Metadata(org.talend.dataquality.common.inference.Metadata) DateParser(org.talend.dataprep.transformation.actions.date.DateParser) DataTypeFrequencyStatistics(org.talend.dataquality.statistics.frequency.DataTypeFrequencyStatistics) LoggerFactory(org.slf4j.LoggerFactory) SemanticCategoryEnum(org.talend.dataquality.semantic.classifier.SemanticCategoryEnum) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) DateTimePatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.DateTimePatternRecognizer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) ValueQualityStatistics(org.talend.dataquality.common.inference.ValueQualityStatistics) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SummaryStatistics(org.talend.dataquality.statistics.numeric.summary.SummaryStatistics) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) EmptyPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.EmptyPatternRecognizer) SemanticType(org.talend.dataquality.semantic.statistics.SemanticType) PrintWriter(java.io.PrintWriter) DictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.DictionarySnapshotProvider) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) Logger(org.slf4j.Logger) 
LatinExtendedCharPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.LatinExtendedCharPatternRecognizer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StringWriter(java.io.StringWriter) RowMetadataUtils(org.talend.dataprep.api.dataset.row.RowMetadataUtils) StreamDateHistogramStatistics(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics) StandardDictionarySnapshotProvider(org.talend.dataquality.semantic.snapshot.StandardDictionarySnapshotProvider) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) TextLengthStatistics(org.talend.dataquality.statistics.text.TextLengthStatistics) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) Collectors(java.util.stream.Collectors) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) PatternFrequencyStatistics(org.talend.dataquality.statistics.frequency.pattern.PatternFrequencyStatistics) Analyzer(org.talend.dataquality.common.inference.Analyzer) QuantileStatistics(org.talend.dataquality.statistics.numeric.quantile.QuantileStatistics) AbstractPatternRecognizer(org.talend.dataquality.statistics.frequency.recognition.AbstractPatternRecognizer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeOccurences(org.talend.dataquality.statistics.type.DataTypeOccurences) 
ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) AbstractFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.AbstractFrequencyAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) NullAnalyzer(org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) Analyzer(org.talend.dataquality.common.inference.Analyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) CompositePatternFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.pattern.CompositePatternFrequencyAnalyzer) DataTypeQualityAnalyzer(org.talend.dataquality.statistics.quality.DataTypeQualityAnalyzer) DataTypeEnum(org.talend.dataquality.statistics.type.DataTypeEnum) StreamNumberHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer) SummaryAnalyzer(org.talend.dataquality.statistics.numeric.summary.SummaryAnalyzer) CardinalityAnalyzer(org.talend.dataquality.statistics.cardinality.CardinalityAnalyzer) 
DataTypeFrequencyAnalyzer(org.talend.dataquality.statistics.frequency.DataTypeFrequencyAnalyzer) TextLengthAnalyzer(org.talend.dataquality.statistics.text.TextLengthAnalyzer) SemanticAnalyzer(org.talend.dataquality.semantic.statistics.SemanticAnalyzer) QuantileAnalyzer(org.talend.dataquality.statistics.numeric.quantile.QuantileAnalyzer) ValueQualityAnalyzer(org.talend.dataquality.statistics.quality.ValueQualityAnalyzer) SemanticQualityAnalyzer(org.talend.dataquality.semantic.statistics.SemanticQualityAnalyzer) StreamDateHistogramAnalyzer(org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer) DictionarySnapshot(org.talend.dataquality.semantic.snapshot.DictionarySnapshot) DataTypeAnalyzer(org.talend.dataquality.statistics.type.DataTypeAnalyzer)

Aggregations

DateParser (org.talend.dataprep.transformation.actions.date.DateParser)3 Test (org.junit.Test)2 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)2 PrintWriter (java.io.PrintWriter)1 StringWriter (java.io.StringWriter)1 DateTimeException (java.time.DateTimeException)1 java.util (java.util)1 Collectors (java.util.stream.Collectors)1 StringUtils (org.apache.commons.lang.StringUtils)1 Logger (org.slf4j.Logger)1 LoggerFactory (org.slf4j.LoggerFactory)1 DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow)1 RowMetadataUtils (org.talend.dataprep.api.dataset.row.RowMetadataUtils)1 StreamDateHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramAnalyzer)1 StreamDateHistogramStatistics (org.talend.dataprep.api.dataset.statistics.date.StreamDateHistogramStatistics)1 StreamNumberHistogramAnalyzer (org.talend.dataprep.api.dataset.statistics.number.StreamNumberHistogramAnalyzer)1 TypeUtils (org.talend.dataprep.api.type.TypeUtils)1 NullAnalyzer (org.talend.dataprep.transformation.api.transformer.json.NullAnalyzer)1 Analyzer (org.talend.dataquality.common.inference.Analyzer)1 Analyzers (org.talend.dataquality.common.inference.Analyzers)1