use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class XlsSchemaParser method parse.
/**
 * Parses all the sheets of the content carried by the request and wraps them in a {@link Schema}.
 * <p>
 * The resulting schema is named after the first parsed sheet. Its draft flag is off when exactly one sheet was
 * found and on when several sheets were found.
 *
 * @param request container with information needed to parse the raw data.
 * @return the schema holding every parsed sheet.
 * @throws TDPException with {@code UNABLE_TO_READ_DATASET_CONTENT} when no sheet was parsed, or with
 * {@code UNEXPECTED_EXCEPTION} wrapping any other exception raised while parsing.
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {
    final Marker marker = Markers.dataset(request.getMetadata().getId());
    // the placeholder needs an argument, otherwise the literal "{}" ends up in the log
    LOGGER.debug(marker, "parsing {} ", request.getMetadata().getId());
    try {
        final List<Schema.SheetContent> sheetContents = parseAllSheets(request);
        if (sheetContents.isEmpty()) {
            // nothing to parse
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT);
        }
        // only one sheet: final result, multiple sheets: set draft flag on
        final boolean draft = sheetContents.size() > 1;
        return Schema.Builder.parserResult() //
                .sheetContents(sheetContents) //
                .draft(draft) //
                .sheetName(sheetContents.get(0).getName()) //
                .build();
    } catch (TDPException e) {
        // already a functional error, let it through untouched
        throw e;
    } catch (Exception e) {
        LOGGER.debug(marker, "IOException during parsing xls request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class CSVSchemaParser method parse.
/**
 * Builds a single-sheet {@link Schema} from the header found in the raw data.
 * <p>
 * The parameters guessed from the request are stored on the metadata content, then one column is created per
 * header entry. Column ids follow the header order, starting at 0, and every column is typed as
 * {@code Type.STRING}.
 *
 * @param request container with information needed to parse the raw data.
 * @return a non draft schema holding one sheet named {@code META_KEY} with one column per header entry.
 * @throws TDPException with {@code UNABLE_TO_READ_CONTENT} when anything goes wrong while reading the content,
 * including when no header could be retrieved.
 */
@Override
public Schema parse(Request request) {
    // keep a direct reference on the only sheet instead of searching it back in the list for every column
    final Schema.SheetContent metaSheet = new Schema.SheetContent(META_KEY, new ArrayList<>());
    final List<Schema.SheetContent> sheetContents = new ArrayList<>();
    sheetContents.add(metaSheet);
    try {
        final DataSetMetadata metadata = request.getMetadata();
        final Map<String, String> parameters = guess(request, metadata.getEncoding());
        metadata.getContent().setParameters(parameters);
        final List<String> header = csvFormatUtils.retrieveHeader(parameters);
        if (header == null || header.isEmpty()) {
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT);
        }
        LOGGER.debug("Columns found: {}", header);
        // By default, consider all columns as Strings (to be refined by deeper analysis).
        LOGGER.debug("Setting default type for columns...");
        int i = 0;
        for (String column : header) {
            metaSheet.getColumnMetadatas().add(column().id(i++).name(column).type(Type.STRING).build());
        }
    } catch (Exception e) {
        // NOTE(review): the TDPException thrown above for a missing header is also caught here and re-wrapped
        // as UNABLE_TO_READ_CONTENT - confirm this is the intended error code for callers.
        throw new TDPException(CommonErrorCodes.UNABLE_TO_READ_CONTENT, e);
    }
    return Schema.Builder.parserResult() //
            .sheetContents(sheetContents) //
            .draft(false) //
            .build();
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class HtmlSerializerTest method html_serializer_with_jira_export.
/**
 * Guesses the schema of the "jira_export.xls" resource with the HTML schema guesser, then serializes the same
 * resource and checks the first value of the first serialized row.
 */
@Test
public void html_serializer_with_jira_export() throws Exception {
    final SchemaParser.Request request;
    final Schema result;
    try (InputStream inputStream = this.getClass().getResourceAsStream("jira_export.xls")) {
        // We do know the format and therefore we go directly to the HTML schema guessing
        request = getRequest(inputStream, "#2");
        request.getMetadata().setEncoding("UTF-16");
        result = htmlSchemaGuesser.parse(request);
    }

    final List<ColumnMetadata> columns = result.getSheetContents().get(0).getColumnMetadatas();
    Assert.assertThat(columns.size(), is(98));
    request.getMetadata().getRowMetadata().setColumns(columns);

    final String json;
    // the serialized stream is a resource too: close it along with the source stream
    try (InputStream inputStream = this.getClass().getResourceAsStream("jira_export.xls");
            InputStream jsonStream = htmlSerializer.serialize(inputStream, request.getMetadata(), -1)) {
        json = IOUtils.toString(jsonStream, UTF_8);
    }

    ObjectMapper mapper = new ObjectMapper();
    CollectionType collectionType = mapper.getTypeFactory().constructCollectionType(ArrayList.class, TreeMap.class);
    List<Map<String, String>> values = mapper.readValue(json, collectionType);
    Map<String, String> row0 = values.get(0);
    // trim every value of the first row before comparing
    row0.replaceAll((key, value) -> value.trim());
    Assertions.assertThat(row0).contains(MapEntry.entry("0001", "TDP-1"));
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class XlsSchemaParserTest method parse_should_extract_multi_sheet_xls.
/**
 * Parses a multi sheet xls resource and checks that the resulting schema has sheet contents, is flagged as a
 * draft and carries the expected sheet name.
 */
@Test
public void parse_should_extract_multi_sheet_xls() throws Exception {
    // given
    try (InputStream xlsContent = this.getClass().getResourceAsStream("Talend_Desk-Tableau-Bord-011214.xls")) {
        final SchemaParser.Request multiSheetRequest = getRequest(xlsContent, "My Dataset");

        // when
        final Schema parsed = parser.parse(multiSheetRequest);

        // then
        assertThat(parsed.getSheetContents(), is(notNullValue()));
        assertThat(parsed.draft(), is(true));
        assertThat(parsed.getSheetName(), is("Sumary"));
    }
}
use of org.talend.dataprep.schema.Schema in project data-prep by Talend.
the class XlsSchemaParserTest method checkColumnsName.
/**
 * Parses the given excel content and verifies that the column names of the first parsed sheet are exactly the
 * expected ones.
 *
 * @param inputStream the excel content to parse.
 * @param expectedColsName the column names the first sheet is expected to have.
 * @throws IOException if an error occurs while reading the excel file.
 */
private void checkColumnsName(InputStream inputStream, String... expectedColsName) throws IOException {
    final SchemaParser.Request request = new SchemaParser.Request(inputStream, ioTestUtils.getSimpleDataSetMetadata());
    final Schema schema = parser.parse(request);

    // only the first sheet is checked
    final List<String> columnNames = schema.getSheetContents() //
            .get(0) //
            .getColumnMetadatas() //
            .stream() //
            .map(columnMetadata -> columnMetadata.getName()) //
            .collect(Collectors.toList());

    assertThat(columnNames).containsExactly(expectedColsName);
}
Aggregations