use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class XlsSchemaParserTest method parse_should_extract_single_sheet_xls.
/**
 * Parses the "simple.xls" test resource and verifies the resulting schema: sheet contents are
 * present, the schema is not flagged as draft and the sheet name is "Feuille1".
 */
@Test
public void parse_should_extract_single_sheet_xls() throws Exception {
    // given
    final String resourceName = "simple.xls";
    try (InputStream xlsContent = this.getClass().getResourceAsStream(resourceName)) {
        final SchemaParser.Request parseRequest = getRequest(xlsContent, "My Dataset");

        // when
        final Schema parsedSchema = parser.parse(parseRequest);

        // then
        assertThat(parsedSchema.getSheetContents(), is(notNullValue()));
        assertThat(parsedSchema.draft(), is(false));
        assertThat(parsedSchema.getSheetName(), is("Feuille1"));
    }
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class DataSetService method preview.
/**
 * Returns a preview of the data set content for the given id (first 100 rows).
 * <p>
 * Response statuses set by this method:
 * <ul>
 * <li>{@code 204 NO_CONTENT} with an empty data set when no metadata exists for the id, or when the
 * requested sheet cannot be found in the schema parser result;</li>
 * <li>{@code 301 MOVED_PERMANENTLY} with a {@code Location} header pointing to
 * {@code /datasets/{id}/content} when the data set is no longer a draft.</li>
 * </ul>
 * The original contract also mentions {@link org.apache.http.HttpStatus#SC_ACCEPTED} while the analysis
 * is not yet fully completed; that status is not set in this method.
 * NOTE(review): confirm whether it is produced elsewhere.
 *
 * @param metadata If <code>true</code>, includes data set metadata information.
 * @param sheetName the sheet name to preview; when empty, the sheet name already stored in the metadata
 * is used, falling back to the first sheet of the schema parser result.
 * @param dataSetId A data set id.
 * @return the preview content (at most 100 records), or an empty data set in the cases listed above.
 */
@RequestMapping(value = "/datasets/{id}/preview", method = RequestMethod.GET)
@ApiOperation(value = "Get a data preview set by id", notes = "Get a data set preview content based on provided id. Not valid or non existing data set id returns empty content. Data set not in drat status will return a redirect 301")
@Timed
@ResponseBody
public DataSet preview(@RequestParam(defaultValue = "true") @ApiParam(name = "metadata", value = "Include metadata information in the response") boolean metadata, @RequestParam(defaultValue = "") @ApiParam(name = "sheetName", value = "Sheet name to preview") String sheetName, @PathVariable(value = "id") @ApiParam(name = "id", value = "Id of the requested data set") String dataSetId) {
    DataSetMetadata dataSetMetadata = dataSetMetadataRepository.get(dataSetId);
    if (dataSetMetadata == null) {
        HttpResponseContext.status(HttpStatus.NO_CONTENT);
        // No data set, returns empty content.
        return DataSet.empty();
    }
    if (!dataSetMetadata.isDraft()) {
        // Moved to get data set content operation
        HttpResponseContext.status(HttpStatus.MOVED_PERMANENTLY);
        HttpResponseContext.header("Location", "/datasets/" + dataSetId + "/content");
        // dataset not anymore a draft so preview doesn't make sense.
        return DataSet.empty();
    }
    // An explicitly requested sheet overrides the one stored in the metadata
    if (StringUtils.isNotEmpty(sheetName)) {
        dataSetMetadata.setSheetName(sheetName);
    }
    // take care of previous data without schema parser result
    if (dataSetMetadata.getSchemaParserResult() != null) {
        // sheet not yet set correctly so use the first one
        // NOTE(review): get(0) assumes the parser result holds at least one sheet - confirm
        if (StringUtils.isEmpty(dataSetMetadata.getSheetName())) {
            String firstSheetName = dataSetMetadata.getSchemaParserResult().getSheetContents().get(0).getName();
            LOG.debug("preview for dataSetMetadata: {} with sheetName: {}", dataSetId, firstSheetName);
            dataSetMetadata.setSheetName(firstSheetName);
        }
        String theSheetName = dataSetMetadata.getSheetName();
        Optional<Schema.SheetContent> sheetContentFound = dataSetMetadata.getSchemaParserResult().getSheetContents().stream()
                .filter(sheetContent -> theSheetName.equals(sheetContent.getName()))
                .findFirst();
        if (!sheetContentFound.isPresent()) {
            HttpResponseContext.status(HttpStatus.NO_CONTENT);
            // No sheet found, returns empty content.
            return DataSet.empty();
        }
        // Expose the columns of the selected sheet as the row metadata of the preview
        List<ColumnMetadata> columnMetadatas = sheetContentFound.get().getColumnMetadatas();
        if (dataSetMetadata.getRowMetadata() == null) {
            dataSetMetadata.setRowMetadata(new RowMetadata(emptyList()));
        }
        dataSetMetadata.getRowMetadata().setColumns(columnMetadatas);
    } else {
        // Pass the id so the "{}" placeholder is actually filled in the log output
        LOG.warn("dataset#{} has draft status but any SchemaParserResult", dataSetId);
    }
    // Build the result
    DataSet dataSet = new DataSet();
    if (metadata) {
        dataSet.setMetadata(conversionService.convert(dataSetMetadata, UserDataSetMetadata.class));
    }
    // Preview is capped to the first 100 records
    dataSet.setRecords(contentStore.stream(dataSetMetadata).limit(100));
    return dataSet;
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class HtmlSchemaParser method parse.
/**
 * Builds a schema from the header values collected while parsing the request content as HTML: each
 * header value becomes a {@code Type.STRING} column whose id is its position in the list, and all
 * columns are gathered in a single sheet of a non-draft schema.
 *
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {
    try {
        final SimpleHeadersContentHandler headersHandler = new SimpleHeadersContentHandler();
        final InputStream content = request.getContent();
        new HtmlParser().parse(content, headersHandler, new Metadata(), new ParseContext());

        final List<ColumnMetadata> headerColumns = new ArrayList<>(headersHandler.getHeaderValues().size());
        int nextId = 0;
        for (String header : headersHandler.getHeaderValues()) {
            // ATM not doing any complicated type calculation: every column is a string
            final ColumnMetadata column = ColumnMetadata.Builder.column()
                    .type(Type.STRING)
                    .name(header)
                    .id(nextId++)
                    .build();
            headerColumns.add(column);
        }

        final Schema.SheetContent singleSheet = new Schema.SheetContent();
        singleSheet.setColumnMetadatas(headerColumns);

        return Schema.Builder.parserResult()
                .sheetContents(Collections.singletonList(singleSheet))
                .draft(false)
                .build();
    } catch (Exception e) {
        LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class DataSetMetadataBuilderTest method createCompleteMetadata.
/**
 * Creates a {@link DataSetMetadata} fixture with every visible property populated: two columns
 * ("id" and "Name"), CSV content settings, a draft schema parser result, a local store location,
 * certification, sheet name, encoding, lifecycle flags and a tag.
 *
 * @return the populated metadata.
 */
private DataSetMetadata createCompleteMetadata() {
    // Row metadata made of two columns
    final List<ColumnMetadata> columns = new ArrayList<>(2);
    columns.add(ColumnMetadata.Builder.column().id(0).name("id").type(INTEGER).build());
    columns.add(ColumnMetadata.Builder.column().id(1).name("Name").type(STRING).build());
    final RowMetadata row = new RowMetadata();
    row.setColumns(columns);

    // Content description
    final Map<String, String> contentParameters = new HashMap<>(0);
    contentParameters.put("encoding", "UTF-8");
    final DataSetContent dataSetContent = new DataSetContent();
    dataSetContent.setNbRecords(1000);
    dataSetContent.setLimit(1000L);
    dataSetContent.setNbLinesInHeader(10);
    dataSetContent.setNbLinesInFooter(10);
    dataSetContent.setFormatFamilyId("formatGuess#csv");
    dataSetContent.setMediaType("text/csv");
    dataSetContent.setParameters(contentParameters);

    // Schema parser result flagged as draft
    final Schema draftSchema = new Schema.Builder().draft(true).build();

    // Assemble the metadata itself
    final DataSetMetadata completeMetadata = new DataSetMetadata(
            "18ba64c154d5",
            "Avengers stats",
            "Stan Lee",
            System.currentTimeMillis(),
            System.currentTimeMillis(),
            row,
            "1.0");
    completeMetadata.setLocation(new LocalStoreLocation());
    completeMetadata.getGovernance().setCertificationStep(CERTIFIED);
    completeMetadata.setSheetName("Sheet 1");
    completeMetadata.setDraft(true);
    completeMetadata.setContent(dataSetContent);
    completeMetadata.setEncoding("UTF-8");

    // Lifecycle flags
    completeMetadata.getLifecycle().contentIndexed(true);
    completeMetadata.getLifecycle().qualityAnalyzed(true);
    completeMetadata.getLifecycle().schemaAnalyzed(true);
    completeMetadata.getLifecycle().setInProgress(true);
    completeMetadata.getLifecycle().setImporting(true);

    completeMetadata.setSchemaParserResult(draftSchema);
    completeMetadata.setTag("MyTag");
    return completeMetadata;
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class CSVSchemaParserTest method TDP_2171.
/**
 * Test for TDP-2171: parses the "TDP-2171.csv" resource with the CSV schema parser and checks that
 * the separator stored in the content parameters is a comma and that the retrieved header holds 92
 * entries.
 */
@Test
public void TDP_2171() throws IOException {
    try (InputStream csvContent = this.getClass().getResourceAsStream("TDP-2171.csv")) {
        // We do know the format and therefore we go directly to the CSV schema guessing
        final SchemaParser.Request parseRequest = getRequest(csvContent, "#1");
        parseRequest.getMetadata().setEncoding("UTF-8");

        // The returned schema is not inspected; the assertions read the request's content parameters
        csvSchemaParser.parse(parseRequest);

        final Map<String, String> csvParameters = parseRequest.getMetadata().getContent().getParameters();
        final char separator = csvParameters.get(CSVFormatFamily.SEPARATOR_PARAMETER).charAt(0);
        final List<String> headerNames = csvFormatUtils.retrieveHeader(csvParameters);

        assertEquals(',', separator);
        assertEquals(92, headerNames.size());
    }
}
Aggregations