Search in sources :

Example 1 with Schema

use of org.talend.dataprep.schema.Schema in project data-prep by Talend.

the class XlsSchemaParser method parse.

/**
 * Parses all the sheets of the content carried by the request and wraps them in a {@link Schema}.
 * <p>
 * The returned schema is named after the first parsed sheet. Its draft flag is off when exactly one sheet was
 * parsed and on when several sheets were parsed.
 *
 * @param request the parsing request; the id of its metadata is used to tag the log messages.
 * @return the schema holding every parsed sheet.
 * @throws TDPException with {@code UNABLE_TO_READ_DATASET_CONTENT} when no sheet could be parsed, or with
 * {@code UNEXPECTED_EXCEPTION} (wrapping the cause) for any other failure.
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {
    final Marker marker = Markers.dataset(request.getMetadata().getId());
    // the placeholder needs an argument, otherwise a literal "{}" ends up in the log
    LOGGER.debug(marker, "parsing {} ", request.getMetadata().getId());
    try {
        final List<Schema.SheetContent> sheetContents = parseAllSheets(request);
        if (sheetContents.isEmpty()) {
            // nothing to parse
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT);
        }
        // only one sheet: draft flag off; multiple sheets: draft flag on
        final boolean draft = sheetContents.size() > 1;
        return Schema.Builder.parserResult() //
                .sheetContents(sheetContents) //
                .draft(draft) //
                .sheetName(sheetContents.get(0).getName()) //
                .build();
    } catch (TDPException e) {
        // already a functional error: let it through untouched
        throw e;
    } catch (Exception e) {
        LOGGER.debug(marker, "IOException during parsing xls request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }
}
Also used : TDPException(org.talend.dataprep.exception.TDPException) Schema(org.talend.dataprep.schema.Schema) Marker(org.slf4j.Marker) TDPException(org.talend.dataprep.exception.TDPException) IOException(java.io.IOException)

Example 2 with Schema

use of org.talend.dataprep.schema.Schema in project data-prep by Talend.

the class CSVSchemaParser method parse.

/**
 * Guesses the CSV parameters of the dataset, stores them on the dataset content, reads the header from those
 * parameters and builds a schema with one column per header entry.
 * <p>
 * By default every column is typed as {@code Type.STRING} (to be refined by deeper analysis) and columns are
 * numbered from 0 in header order.
 *
 * @param request container with information needed to parse the raw data.
 * @return a non-draft schema whose sheet contents hold a single sheet, created with {@code META_KEY}, carrying
 * the header columns.
 * @throws TDPException with {@code UNABLE_TO_READ_CONTENT} for any failure during parsing. This includes the
 * "no header found" case, whose {@code UNABLE_TO_READ_DATASET_CONTENT} exception is caught and wrapped as the cause.
 */
@Override
public Schema parse(Request request) {
    // Keep a direct handle on the only sheet instead of searching the list for it on every column.
    final Schema.SheetContent metaSheet = new Schema.SheetContent(META_KEY, new ArrayList<>());
    final List<Schema.SheetContent> sheetContents = new ArrayList<>();
    sheetContents.add(metaSheet);
    try {
        final DataSetMetadata metadata = request.getMetadata();
        final Map<String, String> parameters = guess(request, metadata.getEncoding());
        metadata.getContent().setParameters(parameters);
        final List<String> header = csvFormatUtils.retrieveHeader(parameters);
        if (header == null || header.isEmpty()) {
            throw new TDPException(DataSetErrorCodes.UNABLE_TO_READ_DATASET_CONTENT);
        }
        LOGGER.debug("Columns found: {}", header);
        // By default, consider all columns as Strings (to be refined by deeper analysis).
        LOGGER.debug("Setting default type for columns...");
        int i = 0;
        for (String column : header) {
            metaSheet.getColumnMetadatas().add(column().id(i++).name(column).type(Type.STRING).build());
        }
    } catch (Exception e) {
        // NOTE: this also catches the TDPException thrown above and reports it as UNABLE_TO_READ_CONTENT.
        throw new TDPException(CommonErrorCodes.UNABLE_TO_READ_CONTENT, e);
    }
    return Schema.Builder.parserResult() //
            .sheetContents(sheetContents) //
            .draft(false) //
            .build();
}
Also used : TDPException(org.talend.dataprep.exception.TDPException) Schema(org.talend.dataprep.schema.Schema) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata) TDPException(org.talend.dataprep.exception.TDPException)

Example 3 with Schema

use of org.talend.dataprep.schema.Schema in project data-prep by Talend.

the class HtmlSerializerTest method html_serializer_with_jira_export.

/**
 * Guesses the schema of the "jira_export.xls" resource with the HTML schema guesser (UTF-16 encoding), then
 * serializes the same resource with the HTML serializer and checks the first row of the produced JSON.
 */
@Test
public void html_serializer_with_jira_export() throws Exception {
    final SchemaParser.Request request;
    final Schema result;

    // First pass over the resource: schema guessing.
    try (InputStream schemaStream = this.getClass().getResourceAsStream("jira_export.xls")) {
        // We do know the format and therefore we go directly to the HTML schema guessing
        request = getRequest(schemaStream, "#2");
        request.getMetadata().setEncoding("UTF-16");
        result = htmlSchemaGuesser.parse(request);
    }

    // Second pass over the resource: serialization using the guessed columns.
    try (InputStream contentStream = this.getClass().getResourceAsStream("jira_export.xls")) {
        final List<ColumnMetadata> guessedColumns = result.getSheetContents().get(0).getColumnMetadatas();
        Assert.assertThat(guessedColumns.size(), is(98));
        request.getMetadata().getRowMetadata().setColumns(guessedColumns);

        final InputStream jsonStream = htmlSerializer.serialize(contentStream, request.getMetadata(), -1);
        final String json = IOUtils.toString(jsonStream, UTF_8);

        // Read the JSON back as a list of rows, each row being a map of column id to value.
        final ObjectMapper mapper = new ObjectMapper();
        final CollectionType rowsType = mapper.getTypeFactory().constructCollectionType(ArrayList.class, TreeMap.class);
        final List<Map<String, String>> rows = mapper.readValue(json, rowsType);

        // Trim every value of the first row before comparing.
        final Map<String, String> firstRow = rows.get(0);
        firstRow.replaceAll((columnId, value) -> value.trim());

        Assertions.assertThat(firstRow).contains(MapEntry.entry("0001", "TDP-1"));
    }
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) InputStream(java.io.InputStream) Schema(org.talend.dataprep.schema.Schema) CollectionType(com.fasterxml.jackson.databind.type.CollectionType) SchemaParser(org.talend.dataprep.schema.SchemaParser) TreeMap(java.util.TreeMap) Map(java.util.Map) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.Test)

Example 4 with Schema

use of org.talend.dataprep.schema.Schema in project data-prep by Talend.

the class XlsSchemaParserTest method parse_should_extract_multi_sheet_xls.

/**
 * Parsing a workbook resource must yield a draft schema with sheet contents, named after the "Sumary" sheet.
 */
@Test
public void parse_should_extract_multi_sheet_xls() throws Exception {
    // given
    try (InputStream workbook = this.getClass().getResourceAsStream("Talend_Desk-Tableau-Bord-011214.xls")) {
        final SchemaParser.Request request = getRequest(workbook, "My Dataset");

        // when
        final Schema parsed = parser.parse(request);

        // then
        assertThat(parsed.getSheetContents(), is(notNullValue()));
        assertThat(parsed.draft(), is(true));
        assertThat(parsed.getSheetName(), is("Sumary"));
    }
}
Also used : InputStream(java.io.InputStream) Schema(org.talend.dataprep.schema.Schema) SchemaParser(org.talend.dataprep.schema.SchemaParser) Test(org.junit.Test)

Example 5 with Schema

use of org.talend.dataprep.schema.Schema in project data-prep by Talend.

the class XlsSchemaParserTest method checkColumnsName.

/**
 * Parses the given excel content and asserts that the column names of the first parsed sheet are exactly the
 * expected ones, in the same order.
 *
 * @param inputStream the excel content to parse.
 * @param expectedColsName the expected column names.
 * @throws IOException if an error occurs while reading the excel file.
 */
private void checkColumnsName(InputStream inputStream, String... expectedColsName) throws IOException {
    // Parse the stream against a simple dataset metadata.
    final SchemaParser.Request request = new SchemaParser.Request(inputStream, ioTestUtils.getSimpleDataSetMetadata());
    final Schema schema = parser.parse(request);

    // Only the first sheet is checked.
    final List<String> parsedNames = schema.getSheetContents() //
            .get(0) //
            .getColumnMetadatas() //
            .stream() //
            .map(ColumnMetadata::getName) //
            .collect(Collectors.toList());

    assertThat(parsedNames).containsExactly(expectedColsName);
}
Also used : ColumnMetadata(org.talend.dataprep.api.dataset.ColumnMetadata) Schema(org.talend.dataprep.schema.Schema) SchemaParser(org.talend.dataprep.schema.SchemaParser) DataSetMetadata(org.talend.dataprep.api.dataset.DataSetMetadata)

Aggregations

Schema (org.talend.dataprep.schema.Schema)14 InputStream (java.io.InputStream)10 SchemaParser (org.talend.dataprep.schema.SchemaParser)8 Test (org.junit.Test)7 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)7 DataSetMetadata (org.talend.dataprep.api.dataset.DataSetMetadata)5 TDPException (org.talend.dataprep.exception.TDPException)5 Map (java.util.Map)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 CollectionType (com.fasterxml.jackson.databind.type.CollectionType)2 Api (io.swagger.annotations.Api)2 ApiOperation (io.swagger.annotations.ApiOperation)2 ApiParam (io.swagger.annotations.ApiParam)2 ByteArrayInputStream (java.io.ByteArrayInputStream)2 OutputStream (java.io.OutputStream)2 PipedInputStream (java.io.PipedInputStream)2 PipedOutputStream (java.io.PipedOutputStream)2