Search in sources:

Example 11 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class StringRules method computeScoreForProperCaseAction.

/**
 * Compute the score of the proper case action from the pattern frequencies of a column.
 *
 * Every pattern is split into words on the space character. A word is accepted only when its first
 * character is 'A' and all of its remaining characters are 'a'.
 *
 * @param columnMetadata the column metadata to analyze.
 * @return LOW as soon as one word of one pattern is not accepted (Proper Case should be suggested),
 * NEGATIVE when every word of every pattern is accepted or when there is no pattern at all.
 */
private static Integer computeScoreForProperCaseAction(ColumnMetadata columnMetadata) {
    for (PatternFrequency frequency : columnMetadata.getStatistics().getPatternFrequencies()) {
        // split the pattern into words
        final StringTokenizer words = new StringTokenizer(frequency.getPattern(), " ");
        while (words.hasMoreTokens()) {
            final String word = words.nextToken();
            if (word.isEmpty()) {
                continue;
            }
            // The first character must be 'A' and every following character must be 'a'.
            boolean properCase = word.charAt(0) == 'A';
            for (int index = 1; properCase && index < word.length(); index++) {
                properCase = word.charAt(index) == 'a';
            }
            if (!properCase) {
                // At least one word is not in proper case: Proper Case should be suggested.
                return LOW;
            }
        }
    }
    return NEGATIVE;
}
Also used : StringTokenizer(java.util.StringTokenizer) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency)

Example 12 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class StatisticsAnalysisTest method testTDP_402.

/**
 * Checks the name, the type and the detected patterns of column "0004" of {@code dataset.csv}.
 * See <a href="https://jira.talendforge.org/browse/TDP-402">https://jira.talendforge.org/browse/TDP-402</a>.
 *
 * @throws Exception on any unexpected error, which makes the test fail.
 */
@Test
public void testTDP_402() throws Exception {
    final DataSetMetadata dataSetMetadata = initializeDataSetMetadata(this.getClass().getResourceAsStream("dataset.csv"));
    final ColumnMetadata birthDateColumn = dataSetMetadata.getRowMetadata().getById("0004");
    assertThat(birthDateColumn.getName(), is("date-of-birth"));
    assertThat(birthDateColumn.getType(), is("date"));

    // Keep only the pattern strings, the frequencies themselves are not checked here.
    final List<String> detectedPatterns = birthDateColumn.getStatistics()
            .getPatternFrequencies()
            .stream()
            .map(PatternFrequency::getPattern)
            .collect(Collectors.toList());

    assertThat(detectedPatterns.size(), is(5));
    assertTrue(detectedPatterns.contains("MM/dd/yyyy"));
    assertTrue(detectedPatterns.contains("dd/MM/yyyy"));
    assertTrue(detectedPatterns.contains("aaaaa"));
    assertTrue(detectedPatterns.contains("yyyy-MM-dd"));
    assertTrue(detectedPatterns.contains("yyyy-M-d"));
}
Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) DatasetImportedEvent(org.talend.dataprep.dataset.event.DatasetImportedEvent) Autowired(org.springframework.beans.factory.annotation.Autowired) Random(java.util.Random) Test(org.junit.Test) UUID(java.util.UUID) FormatAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.FormatAnalysis) Collectors(java.util.stream.Collectors) ContentAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.ContentAnalysis) SchemaAnalysis(org.talend.dataprep.dataset.service.analysis.synchronous.SchemaAnalysis) List(java.util.List) DataSetBaseTest(org.talend.dataprep.dataset.DataSetBaseTest) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) Assert(org.junit.Assert) InputStream(java.io.InputStream) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) Test(org.junit.Test) DataSetBaseTest(org.talend.dataprep.dataset.DataSetBaseTest)

Example 13 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class IntegerRuleTest method setUp.

/**
 * Creates the integer rule and registers the "9.9" pattern on the mostIntColumn fixture.
 */
@Before
public void setUp() throws Exception {
    integerRule = IntegerRules.integerRule();

    final PatternFrequency decimalPattern = new PatternFrequency("9.9", 10);
    mostIntColumn.getStatistics().getPatternFrequencies().add(decimalPattern);
}
Also used : PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) Before(org.junit.Before)

Example 14 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class LowerCaseRuleTest method setUp.

/**
 * Creates the lower case rule and registers one pattern on each string column fixture.
 */
@Before
public void setUp() throws Exception {
    lowerCaseRule = StringRules.lowerCaseRule();

    // Pattern starting with an upper case letter.
    final PatternFrequency capitalizedPattern = new PatternFrequency("Aaaaa", 10);
    stringColumn.getStatistics().getPatternFrequencies().add(capitalizedPattern);

    // Pattern made of lower case letters only.
    final PatternFrequency lowerCasePattern = new PatternFrequency("aaaa", 10);
    stringLowerCaseColumn.getStatistics().getPatternFrequencies().add(lowerCasePattern);
}
Also used : PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) Before(org.junit.Before)

Example 15 with PatternFrequency

use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.

the class ProperCaseRuleTest method setUp.

/**
 * Creates the proper case rule and registers one pattern on each string column fixture.
 */
@Before
public void setUp() throws Exception {
    // NOTE(review): the field is named trailingSpaceRule but holds the proper case rule - consider renaming.
    trailingSpaceRule = StringRules.properCaseRule();

    // Single word ending with an upper case letter.
    final PatternFrequency mixedCasePattern = new PatternFrequency("AaaA", 10);
    stringColumn.getStatistics().getPatternFrequencies().add(mixedCasePattern);

    // Space separated words, each one starting with 'A' followed only by 'a'.
    final PatternFrequency properCasePattern = new PatternFrequency(" Aaa Aa A", 10);
    stringWithProperCaseColumn.getStatistics().getPatternFrequencies().add(properCasePattern);
}
Also used : PatternFrequency(org.talend.dataprep.api.dataset.statistics.PatternFrequency) Before(org.junit.Before)

Aggregations

PatternFrequency (org.talend.dataprep.api.dataset.statistics.PatternFrequency) 25, Test (org.junit.Test) 14, DataSetRow (org.talend.dataprep.api.dataset.row.DataSetRow) 11, ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata) 8, Before (org.junit.Before) 7, RowMetadata (org.talend.dataprep.api.dataset.RowMetadata) 4, HashMap (java.util.HashMap) 3, List (java.util.List) 3, StringUtils (org.apache.commons.lang.StringUtils) 3, CoreMatchers.is (org.hamcrest.CoreMatchers.is) 3, Type (org.talend.dataprep.api.type.Type) 3, IOException (java.io.IOException) 2, InputStream (java.io.InputStream) 2, Arrays (java.util.Arrays) 2, Collections (java.util.Collections) 2, Locale (java.util.Locale) 2, Map (java.util.Map) 2, Optional (java.util.Optional) 2, Assert.assertEquals (org.junit.Assert.assertEquals) 2, Assert.assertFalse (org.junit.Assert.assertFalse) 2