use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class SchemaAnalysisTest method initializeDataSetMetadata.
/**
 * Stores the given raw content as a brand new dataset, then runs the format, content and schema analyses on it.
 *
 * @param content the raw dataset content.
 * @return the dataset metadata read back from the repository once the schema analysis has run.
 */
private DataSetMetadata initializeDataSetMetadata(InputStream content) {
    // Register a new dataset under a random id and store its raw content.
    final String dataSetId = UUID.randomUUID().toString();
    final DataSetMetadata created = metadataBuilder.metadata().id(dataSetId).build();
    dataSetMetadataRepository.save(created);
    contentStore.storeAsRaw(created, content);

    // The analyses are run in this order: format, content, then schema.
    formatAnalysis.analyze(dataSetId);
    contentAnalysis.analyze(dataSetId);
    schemaAnalysis.analyze(dataSetId);

    // Read the metadata back and check that the schema analysis is flagged as done.
    final DataSetMetadata result = dataSetMetadataRepository.get(dataSetId);
    assertNotNull(result);
    assertNotNull(result.getLifecycle());
    assertThat(result.getLifecycle().schemaAnalyzed(), is(true));
    return result;
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class SchemaAnalysisTest method testTDP_471.
/**
 * Checks the name, type, domain and semantic domains of the column read from
 * {@code semantic_type_threshold.csv}.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-471">https://jira.talendforge.org/browse/TDP-471</a>.
 */
@Test
public void testTDP_471() {
    final DataSetMetadata actual = initializeDataSetMetadata(
            DataSetServiceTest.class.getResourceAsStream("../semantic_type_threshold.csv"));
    assertThat(actual.getLifecycle().schemaAnalyzed(), is(true));

    String[] expectedNames = { "gender_column" };
    Type[] expectedTypes = { Type.INTEGER };
    String[] expectedDomains = { "" };

    // Without this check the loop below would pass vacuously on an empty column list.
    assertThat(actual.getRowMetadata().getColumns().size(), is(expectedNames.length));

    int i = 0;
    for (ColumnMetadata column : actual.getRowMetadata().getColumns()) {
        assertThat(column.getName(), is(expectedNames[i]));
        assertThat(column.getType(), is(expectedTypes[i].getName()));
        assertThat(column.getDomain(), is(expectedDomains[i]));
        assertThat(column.getSemanticDomains()).isNotNull().isNotEmpty().hasSize(2).contains(//
                new SemanticDomain("GENDER", "Gender", (float) 35), new SemanticDomain("CIVILITY", "Civility", (float) 20.833334));
        i++;
    }
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class SchemaAnalysisTest method testTDP_855.
/**
 * Checks the name, type and domain label of the first column read from
 * {@code TDP-855_movie_title_detected_as_city.csv}: the column is expected to be a string with an empty domain
 * label.
 *
 * See <a href="https://jira.talendforge.org/browse/TDP-855">TDP-855_movie_title_detected_as_city</a>.
 */
@Test
public void testTDP_855() {
    final DataSetMetadata actual = initializeDataSetMetadata(
            DataSetServiceTest.class.getResourceAsStream("../TDP-855_movie_title_detected_as_city.csv"));
    assertThat(actual.getLifecycle().schemaAnalyzed(), is(true));

    final ColumnMetadata column = actual.getRowMetadata().getColumns().get(0);
    String expectedName = "Genre: (Movie, Program, show)";
    Type expectedType = Type.STRING;

    // Actual value first, expected value inside the matcher, so that failure messages read the right way round.
    assertThat(column.getName(), is(expectedName));
    assertThat(column.getType(), is(expectedType.getName()));
    assertThat(column.getDomainLabel(), is(""));
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class SchemaAnalysisTest method testAnalysis.
/**
 * Checks the names and types of the columns read from {@code avengers.csv}, in column order.
 */
@Test
public void testAnalysis() {
    final DataSetMetadata actual = initializeDataSetMetadata(
            DataSetServiceTest.class.getResourceAsStream("../avengers.csv"));
    assertThat(actual.getLifecycle().schemaAnalyzed(), is(true));

    String[] expectedNames = { "nickname", "secret firstname", "secret lastname", "date of birth", "city" };
    Type[] expectedTypes = { Type.STRING, Type.STRING, Type.STRING, Type.DATE, Type.STRING };

    // Without this check the loop below would pass vacuously if fewer columns than expected were detected.
    assertThat(actual.getRowMetadata().getColumns().size(), is(expectedNames.length));

    // Names and types are matched position by position, so a single index is enough.
    int i = 0;
    for (ColumnMetadata column : actual.getRowMetadata().getColumns()) {
        assertThat(column.getName(), is(expectedNames[i]));
        assertThat(column.getType(), is(expectedTypes[i].getName()));
        i++;
    }
}
use of org.talend.dataprep.api.dataset.DataSetMetadata in project data-prep by Talend.
the class TransformationService method getSemanticDomains.
/**
 * Compute the semantic domains of one column of the given dataset.
 *
 * @param metadata the dataset metadata.
 * @param columnId the id of the column to analyze.
 * @param records the dataset records, read as UTF-8 JSON and mapped to a {@link DataSet}.
 * @return the semantic domains held by the analyzed copy of the column metadata.
 * @throws IOException when reading or parsing the records fails.
 */
private List<SemanticDomain> getSemanticDomains(DataSetMetadata metadata, String columnId, InputStream records) throws IOException {
    // Work on a copy of the column with the "semantic domain forced" flag switched off, so that the statistics
    // adapter sets all the available domains.
    final ColumnMetadata columnCopy = column() //
            .copy(metadata.getRowMetadata().getById(columnId)) //
            .semanticDomainForce(false) //
            .build();

    final Analyzer<Analyzers.Result> semanticAnalyzer = analyzerService.build(columnCopy, SEMANTIC);
    semanticAnalyzer.init();
    try (final JsonParser jsonParser = mapper.getFactory().createParser(new InputStreamReader(records, UTF_8))) {
        final DataSet parsed = mapper.readerFor(DataSet.class).readValue(jsonParser);
        // Feed the analyzer with the value of the requested column, record by record.
        parsed.getRecords().forEach(row -> semanticAnalyzer.analyze(row.get(columnId)));
        semanticAnalyzer.end();
    }

    // Push the analysis results onto the column copy, then return the domains it now holds.
    final List<Analyzers.Result> results = semanticAnalyzer.getResult();
    statisticsAdapter.adapt(singletonList(columnCopy), results);
    return columnCopy.getSemanticDomains();
}
Aggregations