use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class ChangeDatePatternTest method should_set_new_pattern_as_most_used_one_newcolumn.
/**
 * Runs the action with {@code CREATE_NEW_COLUMN} enabled and checks that the requested
 * {@code new_pattern} is listed, with 48 occurrences, in the pattern frequencies of column "0003".
 */
@Test
public void should_set_new_pattern_as_most_used_one_newcolumn() throws Exception {
    // given: three columns, the second one being a date column with pre-loaded statistics
    final DataSetRow row = builder() //
            .with(value("toto").type(Type.STRING).name("recipe")) //
            .with(value("04/25/1999").type(Type.DATE).name("recipe")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("tata").type(Type.STRING).name("last update")) //
            .build();
    parameters.put(CREATE_NEW_COLUMN, "true");

    // when
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then
    // NOTE(review): "0003" is presumably the id of the column created by the action - confirm
    final List<PatternFrequency> patternFrequencies = row.getRowMetadata() //
            .getById("0003") //
            .getStatistics() //
            .getPatternFrequencies();
    String newPattern = parameters.get("new_pattern");
    final Optional<PatternFrequency> newPatternSet = patternFrequencies.stream() //
            .filter(p -> StringUtils.equals(newPattern, p.getPattern())) //
            .findFirst();
    assertTrue(newPatternSet.isPresent());
    // expected value first, actual value second, so a failure message reads correctly
    assertEquals(48, newPatternSet.get().getOccurrences());
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class ChangeDatePatternTest method should_set_new_pattern_as_most_used_one.
/**
 * Runs the action on the date column and checks that the requested {@code new_pattern} is listed,
 * with 48 occurrences, in the pattern frequencies of column "0001".
 */
@Test
public void should_set_new_pattern_as_most_used_one() throws Exception {
    // given: three columns, the second one being a date column with pre-loaded statistics
    final DataSetRow row = builder() //
            .with(value("toto").type(Type.STRING).name("tips")) //
            .with(value("04/25/1999").type(Type.DATE).name("date")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("tata").type(Type.STRING).name("test")) //
            .build();

    // when
    ActionTestWorkbench.test(row, actionRegistry, factory.create(action, parameters));

    // then
    // NOTE(review): "0001" is presumably the id of the date column built above - confirm
    final List<PatternFrequency> patternFrequencies = row.getRowMetadata() //
            .getById("0001") //
            .getStatistics() //
            .getPatternFrequencies();
    String newPattern = parameters.get("new_pattern");
    final Optional<PatternFrequency> newPatternSet = patternFrequencies.stream() //
            .filter(p -> StringUtils.equals(newPattern, p.getPattern())) //
            .findFirst();
    assertTrue(newPatternSet.isPresent());
    // expected value first, actual value second, so a failure message reads correctly
    assertEquals(48, newPatternSet.get().getOccurrences());
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class ChangeDatePatternTest method test_apply_in_newcolumn.
/**
 * Applies the action to three rows with {@code CREATE_NEW_COLUMN} enabled, then checks the
 * resulting row values and the pattern frequencies of the source and target columns.
 */
@Test
public void test_apply_in_newcolumn() throws Exception {
    // given
    final DataSetRow row1 = builder() //
            .with(value("toto").type(Type.STRING).name("recipe")) //
            .with(value("04/25/1999").type(Type.DATE).name("last update")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("tata").type(Type.STRING).name("recipe")) //
            .build();
    final DataSetRow row2 = builder() //
            .with(value("tata").type(Type.STRING).name("recipe")) //
            .with(value("01/22/2018").type(Type.DATE).name("last update")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("toto").type(Type.STRING).name("recipe")) //
            .build();
    // this row holds "22/01/2018"; its expected values below contain no fourth value
    final DataSetRow row3 = builder() //
            .with(value("tata").type(Type.STRING).name("recipe")) //
            .with(value("22/01/2018").type(Type.DATE).name("last update")
                    .statistics(getDateTestJsonAsStream("statistics_MM_dd_yyyy.json"))) //
            .with(value("toto").type(Type.STRING).name("recipe")) //
            .build();
    parameters.put(CREATE_NEW_COLUMN, "true");

    // precondition: the date column starts with 7 known patterns
    assertEquals(7, row1.getRowMetadata().getColumns().get(1).getStatistics().getPatternFrequencies().size());

    // when
    ActionTestWorkbench.test(Arrays.asList(row1, row2, row3), actionRegistry, factory.create(action, parameters));

    // then
    final DataSetRow expectedRow1 = getRow("toto", "04/25/1999", "tata", "25 - Apr - 1999");
    final DataSetRow expectedRow2 = getRow("tata", "01/22/2018", "toto", "22 - Jan - 2018");
    final DataSetRow expectedRow3 = getRow("tata", "22/01/2018", "toto");
    assertEquals(expectedRow1.values(), row1.values());
    assertEquals(expectedRow2.values(), row2.values());
    assertEquals(expectedRow3.values(), row3.values());

    ColumnMetadata sourceColumn = row1.getRowMetadata().getColumns().get(1);
    ColumnMetadata targetColumn = row1.getRowMetadata().getColumns().get(2);
    List<PatternFrequency> sourcePatterns = sourceColumn.getStatistics().getPatternFrequencies();
    List<PatternFrequency> targetPatterns = targetColumn.getStatistics().getPatternFrequencies();

    // check that the stats on the from column are not changed
    assertEquals(7, sourcePatterns.size());
    // NOTE(review): this assertion reads the target column's first pattern although it sits under
    // the source-column check - confirm which column was intended
    assertEquals("MM/dd/yyyy", targetPatterns.get(0).getPattern());

    // check that the stats on the target column are changed, and the new target pattern is added to the known ones
    assertEquals(8, targetPatterns.size());
    assertEquals("dd - MMM - yyyy", targetPatterns.get(7).getPattern());

    // the newly added pattern should have the biggest frequency: the old most used pattern count + 1
    // (expected value first, actual value second, so a failure message reads correctly)
    assertEquals(targetPatterns.get(0).getOccurrences() + 1, targetPatterns.get(7).getOccurrences());
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class PreparationAPITest method testPreparationPreviewOnPreparationWithTrimAction_TDP_5057.
/**
 * Verify a "calculate time since" preview after a trim step on a preparation.
 * See <a href="https://jira.talendforge.org/browse/TDP-5057">TDP-5057</a>.
 *
 * @throws IOException if the preparation content cannot be read or parsed
 */
@Test
public void testPreparationPreviewOnPreparationWithTrimAction_TDP_5057() throws IOException {
    // Create a dataset from csv
    final String datasetId = testClient.createDataset("preview/best_sad_songs_of_all_time.csv", "testPreview");
    // Create a preparation
    String preparationId = testClient.createPreparationFromDataset(datasetId, "testPrep", home.getId());

    // apply trim action on column 0008 ("Added At") to make this column date valid
    Map<String, String> trimParameters = new HashMap<>();
    trimParameters.put("create_new_column", "false");
    trimParameters.put("padding_character", "whitespace");
    trimParameters.put("scope", "column");
    trimParameters.put("column_id", "0008");
    trimParameters.put("column_name", "Added At");
    trimParameters.put("row_id", "null");
    testClient.applyAction(preparationId, Trim.TRIM_ACTION_NAME, trimParameters);

    // check column is date valid after trim action
    final Data preparationData;
    try (InputStream inputStream = testClient.getPreparation(preparationId).asInputStream()) {
        // DeserializationConfig is immutable: calling without(...) on it and discarding the result
        // changes nothing. The lenient setting is applied to a dedicated reader instead, which
        // also leaves the shared mapper untouched.
        preparationData = mapper.readerFor(Data.class) //
                .without(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES) //
                .readValue(inputStream);
    }
    RowMetadata preparationContent = preparationData.metadata;
    List<PatternFrequency> patternFrequencies = preparationContent.getColumns().get(8).getStatistics().getPatternFrequencies();
    assertTrue(patternFrequencies.stream() //
            .map(PatternFrequency::getPattern) //
            .anyMatch("yyyy-MM-dd"::equals));

    // create a preview of calculate time since action
    PreviewAddParameters previewAddParameters = new PreviewAddParameters();
    previewAddParameters.setDatasetId(datasetId);
    previewAddParameters.setPreparationId(preparationId);
    previewAddParameters.setTdpIds(Arrays.asList(1, 2, 3, 4, 5, 6, 7));

    Action timeSinceAction = new Action();
    timeSinceAction.setName(ComputeTimeSince.TIME_SINCE_ACTION_NAME);
    MixedContentMap actionParameters = new MixedContentMap();
    actionParameters.put("create_new_column", "true");
    actionParameters.put("time_unit", "HOURS");
    actionParameters.put("since_when", "now_server_side");
    actionParameters.put("scope", "column");
    actionParameters.put("column_id", "0008");
    actionParameters.put("column_name", "Added At");
    timeSinceAction.setParameters(actionParameters);
    previewAddParameters.setActions(Collections.singletonList(timeSinceAction));

    JsonPath jsonPath = given() //
            .contentType(ContentType.JSON) //
            .body(previewAddParameters) //
            .expect().statusCode(200).log().ifError() //
            .when().post("/api/preparations/preview/add") //
            .jsonPath();

    // check non empty value for the new column: no record may have a blank "0009" value
    assertEquals(//
            "new preview column should contains values according to calculate time since action", //
            0, jsonPath.getList("records.0009").stream().map(String::valueOf).filter(StringUtils::isBlank).count());
}
use of org.talend.dataprep.api.dataset.statistics.PatternFrequency in project data-prep by Talend.
the class RowMetadataUtilsTest method shouldGetMostUsedDatePattern.
/**
 * Registers two pattern frequencies on a date column and checks that
 * {@code RowMetadataUtils.getMostUsedDatePattern} returns the one with the higher count.
 */
@Test
public void shouldGetMostUsedDatePattern() {
    // given: a date column with two patterns, the second having more occurrences
    final ColumnMetadata dateColumn = column() //
            .id(1) //
            .name("date") //
            .type(Type.DATE) //
            .build();
    final List<PatternFrequency> frequencies = dateColumn.getStatistics().getPatternFrequencies();
    frequencies.add(new PatternFrequency("MM-dd-YYYY", 2));
    frequencies.add(new PatternFrequency("dd-YYYY", 4));

    // when / then: the pattern with 4 occurrences is returned
    assertEquals("dd-YYYY", RowMetadataUtils.getMostUsedDatePattern(dateColumn));
}
Aggregations