use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class StringRules method computeScoreForProperCaseAction.
/**
 * Scores whether a "proper case" action is worth suggesting for the given column.
 *
 * <p>Each pattern reported by the column statistics is split on spaces into words. A word is
 * accepted when its pattern is a leading {@code 'A'} followed only by {@code 'a'} characters.
 * The first word that breaks this shape makes the method return {@code LOW} (proper case should
 * be suggested); when every word of every pattern is accepted, or there is nothing to inspect,
 * {@code NEGATIVE} is returned.
 *
 * @param columnMetadata the column metadata to analyze.
 * @return {@code LOW} if at least one word is not in proper case, {@code NEGATIVE} otherwise.
 */
private static Integer computeScoreForProperCaseAction(ColumnMetadata columnMetadata) {
    for (PatternFrequency frequency : columnMetadata.getStatistics().getPatternFrequencies()) {
        // StringTokenizer collapses consecutive/leading/trailing spaces, so every token is a non-empty word.
        final StringTokenizer words = new StringTokenizer(frequency.getPattern(), " ");
        while (words.hasMoreTokens()) {
            final String word = words.nextToken();
            for (int position = 0; position < word.length(); position++) {
                // Only the first character may be 'A'; every following one must be 'a'.
                final char expected = position == 0 ? 'A' : 'a';
                if (word.charAt(position) != expected) {
                    return LOW;
                }
            }
        }
    }
    return NEGATIVE;
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class StatisticsAnalysisTest method testTDP_402.
/**
 * Checks the name, type and detected patterns of column {@code 0004} ({@code date-of-birth})
 * after loading {@code dataset.csv}.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-402">https://jira.talendforge.org/browse/TDP-402</a>.
 *
 * @throws Exception if the data set metadata cannot be initialized.
 */
@Test
public void testTDP_402() throws Exception {
    final DataSetMetadata metadata = initializeDataSetMetadata(this.getClass().getResourceAsStream("dataset.csv"));
    final ColumnMetadata birthColumn = metadata.getRowMetadata().getById("0004");
    assertThat(birthColumn.getName(), is("date-of-birth"));
    assertThat(birthColumn.getType(), is("date"));

    // Keep only the pattern strings; the frequencies themselves are not asserted here.
    final List<String> detected = birthColumn.getStatistics().getPatternFrequencies().stream()
            .map(PatternFrequency::getPattern)
            .collect(Collectors.toList());
    assertThat(detected.size(), is(5));

    final String[] expectedPatterns = { "MM/dd/yyyy", "dd/MM/yyyy", "aaaaa", "yyyy-MM-dd", "yyyy-M-d" };
    for (String expected : expectedPatterns) {
        assertTrue(detected.contains(expected));
    }
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class IntegerRuleTest method setUp.
/**
 * Creates the integer rule under test and adds a {@code "9.9"} pattern to the statistics of
 * {@code mostIntColumn}.
 *
 * @throws Exception if the fixture cannot be prepared.
 */
@Before
public void setUp() throws Exception {
    integerRule = IntegerRules.integerRule();

    // Second constructor argument is presumably the occurrence count -- TODO confirm.
    final PatternFrequency decimalPattern = new PatternFrequency("9.9", 10);
    mostIntColumn.getStatistics().getPatternFrequencies().add(decimalPattern);
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class LowerCaseRuleTest method setUp.
/**
 * Creates the lower case rule under test and seeds the two string columns: one with a
 * capitalized pattern ({@code "Aaaaa"}), one with an all lower case pattern ({@code "aaaa"}).
 *
 * @throws Exception if the fixture cannot be prepared.
 */
@Before
public void setUp() throws Exception {
    lowerCaseRule = StringRules.lowerCaseRule();

    final PatternFrequency capitalized = new PatternFrequency("Aaaaa", 10);
    stringColumn.getStatistics().getPatternFrequencies().add(capitalized);

    final PatternFrequency allLowerCase = new PatternFrequency("aaaa", 10);
    stringLowerCaseColumn.getStatistics().getPatternFrequencies().add(allLowerCase);
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class ProperCaseRuleTest method setUp.
/**
 * Creates the proper case rule under test and seeds the two string columns: one with a pattern
 * that is not in proper case ({@code "AaaA"}), one whose space-separated words all are
 * ({@code " Aaa Aa A"}).
 *
 * @throws Exception if the fixture cannot be prepared.
 */
@Before
public void setUp() throws Exception {
    // NOTE(review): the field is named trailingSpaceRule but holds the proper case rule --
    // looks like a copy/paste leftover; confirm and rename at the declaration.
    trailingSpaceRule = StringRules.properCaseRule();

    final PatternFrequency notProperCase = new PatternFrequency("AaaA", 10);
    stringColumn.getStatistics().getPatternFrequencies().add(notProperCase);

    final PatternFrequency properCaseWords = new PatternFrequency(" Aaa Aa A", 10);
    stringWithProperCaseColumn.getStatistics().getPatternFrequencies().add(properCaseWords);
}
Aggregations