Example use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
From class FormatAnalysisTest, method testXLSXAnalysis.
/**
 * Stores the {@code ../tagada.xls} resource as raw content of a newly saved data set, runs the format analysis
 * on it, then checks the format family, media type and (empty) parameters recorded on the reloaded metadata.
 */
@Test
public void testXLSXAnalysis() {
    // given: a saved data set whose raw content is the Excel resource
    final String dataSetId = UUID.randomUUID().toString();
    final DataSetMetadata saved = metadataBuilder.metadata().id(dataSetId).build();
    dataSetMetadataRepository.save(saved);
    contentStore.storeAsRaw(saved, DataSetServiceTest.class.getResourceAsStream("../tagada.xls"));

    // when: the format analysis runs on that data set
    formatAnalysis.analyze(dataSetId);

    // then: the metadata read back from the repository carries the detected format
    final DataSetMetadata analyzed = dataSetMetadataRepository.get(dataSetId);
    assertThat(analyzed, notNullValue());
    assertThat(analyzed.getContent().getFormatFamilyId(), is(XlsFormatFamily.BEAN_ID));
    assertThat(analyzed.getContent().getMediaType(), is("application/vnd.ms-excel"));
    assertThat(analyzed.getContent().getParameters().isEmpty(), is(true));
}
Example use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
From class FormatAnalysisTest, method testUpdate.
/**
 * Analyzes the {@code ../avengers.csv} resource, then calls {@code formatAnalysis.update} with a copy of the
 * metadata whose encoding and "SEPARATOR" parameter were changed, and checks the metadata read back from the
 * repository.
 */
@Test
public void testUpdate() {
    // given: a saved and analyzed CSV data set
    final String dataSetId = UUID.randomUUID().toString();
    final DataSetMetadata saved = metadataBuilder.metadata().id(dataSetId).build();
    dataSetMetadataRepository.save(saved);
    contentStore.storeAsRaw(saved, DataSetServiceTest.class.getResourceAsStream("../avengers.csv"));
    formatAnalysis.analyze(dataSetId);

    // both versions are read from the repository; only the second one is edited
    final DataSetMetadata before = dataSetMetadataRepository.get(dataSetId);
    final DataSetMetadata edited = dataSetMetadataRepository.get(dataSetId);
    edited.setEncoding("windows-1252");
    edited.getContent().getParameters().put("SEPARATOR", ",");

    // when
    formatAnalysis.update(before, edited);

    // then
    final DataSetMetadata reloaded = dataSetMetadataRepository.get(dataSetId);
    assertNotNull(reloaded);
    assertThat(reloaded.getContent().getFormatFamilyId(), is(CSVFormatFamily.BEAN_ID));
    assertThat(reloaded.getContent().getMediaType(), is("text/csv"));
    assertThat(reloaded.getEncoding(), is("windows-1252"));
    // NOTE(review): the resulting "SEPARATOR" parameter is not asserted here; a former check expecting ";"
    // was disabled -- confirm the intended value before re-enabling it.
}
Example use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
From class QualityAnalysisTest, method TDP_1150_string_must_be_detected_as_so_if_even_if_subtype_is_integer.
/**
 * Checks that the first column ("user_id") of {@code ../valid_must_be_text1.csv} is typed as
 * {@link Type#STRING} once the data set metadata has been initialized, i.e. that string remains the detected
 * type even if the sub type (integer) of the most frequent type (String) is used to detect invalids.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_string_must_be_detected_as_so_if_even_if_subtype_is_integer() {
    // when
    final DataSetMetadata metadata = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../valid_must_be_text1.csv"));

    // then: schema analysis is flagged as done and the first column is a string named "user_id"
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));
    final ColumnMetadata firstColumn = metadata.getRowMetadata().getColumns().get(0);
    assertThat(firstColumn.getName(), is("user_id"));
    assertThat(firstColumn.getType(), is(Type.STRING.getName()));
}
Example use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
From class QualityAnalysisTest, method TDP_1150_full.
/**
 * Checks the name and detected type of each of the twelve leading columns of
 * {@code ../invalids_and_type_detection.csv} after the data set metadata has been initialized.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_full() {
    // given: expected column names and, at the same index, the type expected for each column
    final String[] expectedNames = {
            "string_boolean",
            "double_integer",
            "string_integer",
            "string_double",
            "string_date",
            "type_mix",
            "boolean",
            "integer",
            "double",
            "date",
            "string",
            "empty"
    };
    final Type[] expectedTypes = {
            Type.BOOLEAN,
            Type.DOUBLE,
            Type.INTEGER,
            Type.DOUBLE,
            Type.DATE,
            Type.STRING,
            Type.BOOLEAN,
            Type.INTEGER,
            Type.DOUBLE,
            Type.DATE,
            Type.STRING,
            Type.STRING
    };

    // when
    final DataSetMetadata metadata = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../invalids_and_type_detection.csv"));

    // then
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));
    for (int position = 0; position < expectedTypes.length; position++) {
        final ColumnMetadata current = metadata.getRowMetadata().getColumns().get(position);
        assertThat(current.getName(), is(expectedNames[position]));

        // the message names the offending column so a failure can be traced without a debugger
        final String mismatch = "column '" + current.getName() + "' is expected to be detected as '"
                + expectedTypes[position] + "' but was found as '" + current.getType() + "'";
        assertThat(mismatch, current.getType(), is(expectedTypes[position].getName()));
    }
}
Example use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
From class QualityAnalysisTest, method TDP_1150_integer_must_be_detected_as_so_even_if_sampling_detects_text.
/**
 * Checks that the first column ("user_id") of {@code ../valid_must_be_integer.csv} is typed as
 * {@link Type#INTEGER} once the data set metadata has been initialized.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-1150">https://jira.talendforge.org/browse/TDP-1150</a>.
 */
@Test
public void TDP_1150_integer_must_be_detected_as_so_even_if_sampling_detects_text() {
    // when
    final DataSetMetadata metadata = initializeDataSetMetadata(DataSetServiceTest.class.getResourceAsStream("../valid_must_be_integer.csv"));

    // then: schema analysis is flagged as done and the first column is an integer named "user_id"
    assertThat(metadata.getLifecycle().schemaAnalyzed(), is(true));
    final ColumnMetadata firstColumn = metadata.getRowMetadata().getColumns().get(0);
    assertThat(firstColumn.getName(), is("user_id"));
    assertThat(firstColumn.getType(), is(Type.INTEGER.getName()));
}
Aggregations