Search in sources :

Example 1 with ColumnType

use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project etl by linkedpipes.

the class TableToRdfConfigurator method configure.

/**
 * Configure given {@link TableToRdf} convertor.
 *
 * <p>Resets the convertor state (base URI, name-to-index map, key column,
 * value generators) and rebuilds it from {@code tableToRdf.config} and the
 * given header / first data row.
 *
 * <p>NOTE: {@code ColumnInfo_V1} instances taken from
 * {@code config.columnsInfo} are updated in place (URI and resolved type),
 * so the configuration object is modified by this call.
 *
 * @param tableToRdf Convertor to configure; its {@code config} is read.
 * @param header Column names, or null to generate names col1, col2, ...
 *     A null entry is replaced by the generated name for that position.
 * @param data Contains first data row, or ColumnType if type is already known.
 * @param numberOfLeadingEmpty Number of leading empty columns, this is useful
 *     for xls-like inputs; such columns with automatic type become strings.
 * @throws ParseFailed If {@code data} is null, the header and data sizes
 *     differ, or the configured row class is not a valid IRI.
 * @throws LpException Declared for the helper calls used by this method.
 */
public static void configure(TableToRdf tableToRdf, List<String> header, List<Object> data, int numberOfLeadingEmpty) throws ParseFailed, LpException {
    // initial checks
    if (data == null) {
        throw new ParseFailed("First data row is null!");
    }
    if (header != null && header.size() != data.size()) {
        throw new ParseFailed("Diff number of cells in header (" + header.size() + ") and data (" + data.size() + ")");
    }
    final TableToRdfConfig config = tableToRdf.config;
    // clear configuration
    tableToRdf.baseUri = config.baseURI;
    tableToRdf.infoMap = null;
    tableToRdf.keyColumn = null;
    tableToRdf.nameToIndex = new HashMap<>();
    // prepare locals
    // User-configured columns that have not been matched to a real column yet.
    final Map<String, ColumnInfo_V1> unused = new HashMap<>(config.columnsInfo);
    final List<ValueGenerator> valueGenerators = new ArrayList<>(data.size());
    // generate configuration - Column Mapping
    String keyTemplateStr = null;
    for (int index = 0; index < data.size(); index++) {
        // generate column name and add it to map
        final String headerName = header == null ? null : header.get(index);
        final String columnName;
        if (headerName != null) {
            columnName = headerName;
        } else {
            if (header != null) {
                LOG.info("Generated value used for column with 'null' name.");
            }
            // use generated one - first is col1, col2 ...
            columnName = "col" + (index + 1);
        }
        LOG.debug("New column found '{}'", columnName);
        // columnName is never null here, so no null check is needed.
        tableToRdf.nameToIndex.put(columnName, index);
        // test for key
        if (config.keyColumn != null && !config.keyColumn.isEmpty() && !config.advancedKeyColumn && config.keyColumn.equals(columnName)) {
            // we construct template and use it
            keyTemplateStr = "<" + prepareAsUri("{", config) + columnName + "}>";
        }
        // check for user template
        final ColumnInfo_V1 columnInfo;
        if (config.columnsInfo.containsKey(columnName)) {
            // use user config
            columnInfo = config.columnsInfo.get(columnName);
            unused.remove(columnName);
        } else if (!config.generateNew) {
            // no new generation
            continue;
        } else {
            // generate new
            columnInfo = new ColumnInfo_V1();
        }
        // fill other values if needed
        if (columnInfo.getURI() == null) {
            columnInfo.setURI(config.baseURI + Utils.convertStringToIRIPart(columnName));
        } else {
            columnInfo.setURI(prepareAsUri(columnInfo.getURI(), config));
        }
        if (columnInfo.getType() == ColumnType.Auto) {
            if (index < numberOfLeadingEmpty || config.autoAsStrings) {
                // Empty leading column (string without warning) or
                // the user asked to treat all automatic types as strings.
                columnInfo.setType(ColumnType.String);
            } else {
                columnInfo.setType(guessType(columnName, data.get(index), columnInfo.isUseTypeFromDfb()));
            }
        }
        // generate tableToRdf configuration from 'columnInfo'
        final String template = generateTemplate(columnInfo, columnName);
        LOG.debug("Template for column '{}' is '{}'", columnName, template);
        // add to configuration
        valueGenerators.add(ValueGeneratorReplace.create(tableToRdf.valueFactory.createIRI(columnInfo.getURI()), template));
        // generate metadata about column - for now only labels
        if (config.generateLabels) {
            tableToRdf.outRdf.add(tableToRdf.valueFactory.createIRI(columnInfo.getURI()), RDFS.LABEL, tableToRdf.valueFactory.createLiteral(columnName));
        }
    }
    // key template
    if (config.advancedKeyColumn) {
        // we use keyColumn directly
        tableToRdf.keyColumn = ValueGeneratorReplace.create(null, config.keyColumn);
        tableToRdf.keyColumn.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    } else if (keyTemplateStr != null) {
        // we have constructed template
        LOG.info("Key column template: {}", keyTemplateStr);
        tableToRdf.keyColumn = ValueGeneratorReplace.create(null, keyTemplateStr);
        tableToRdf.keyColumn.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    }
    // Otherwise keyColumn stays null, and then row number is used.
    // TODO: we do not support this functionality ..
    for (Map.Entry<String, ColumnInfo_V1> entry : unused.entrySet()) {
        final String key = entry.getKey();
        if (key.isEmpty()) {
            // - bug fix
            continue;
        }
        if (config.ignoreMissingColumn) {
            LOG.info("Column '{}' (uri:{}) ignored as does not match " + "original columns.", key, entry.getValue().getURI());
        } else {
            LOG.error("Column '{}' (uri:{}) ignored as does not match " + "original columns.", key, entry.getValue().getURI());
        }
    }
    // add advanced
    for (TabularConfig_V2.AdvanceMapping item : config.columnsInfoAdv) {
        // prepare URI
        final String uri = prepareAsUri(item.getUri(), config);
        // add template
        valueGenerators.add(ValueGeneratorReplace.create(tableToRdf.valueFactory.createIRI(uri), item.getTemplate()));
    }
    // Compile valueGenerators
    for (ValueGenerator generator : valueGenerators) {
        generator.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    }
    // final checks and data sets
    tableToRdf.infoMap = valueGenerators.toArray(new ValueGenerator[0]);
    if (config.rowsClass != null && !config.rowsClass.isEmpty()) {
        try {
            tableToRdf.rowClass = tableToRdf.valueFactory.createIRI(config.rowsClass);
        } catch (IllegalArgumentException ex) {
            throw new ParseFailed("Failed to create row's class URI from:" + config.rowsClass, ex);
        }
    }
}
Also used : ValueGenerator(com.linkedpipes.plugin.transformer.tabularuv.column.ValueGenerator) TabularConfig_V2(com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2) ParseFailed(com.linkedpipes.plugin.transformer.tabularuv.parser.ParseFailed) ColumnInfo_V1(com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnInfo_V1)

Example 2 with ColumnType

use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project etl by linkedpipes.

the class ParserXls method parseSheet.

/**
 * Parse given sheet.
 *
 * <p>Reads an optional header row, resolves the configured named cells,
 * configures {@code tableToRdf} from the first non-null data row and then
 * passes every non-empty data row to {@code tableToRdf}. Header names, types
 * and data values are all collected from column 0, so that position
 * {@code i} always refers to the same sheet column.
 *
 * @param wb Workbook that contains the sheet.
 * @param sheetIndex Index of the sheet in {@code wb}.
 * @throws ParseFailed If the header row, or a row or cell referenced by
 *     a named cell, is missing.
 * @throws LpException Declared for the helper calls used by this method.
 */
public void parseSheet(Workbook wb, Integer sheetIndex) throws ParseFailed, LpException {
    LOG.debug("parseSheet({}, {})", wb.getSheetName(sheetIndex), sheetIndex);
    // for every row
    final Sheet sheet = wb.getSheetAt(sheetIndex);
    if (config.numberOfStartLinesToIgnore > sheet.getLastRowNum()) {
        // no data to parse
        return;
    }
    // generate column names
    int startRow = config.numberOfStartLinesToIgnore;
    List<String> columnNames;
    // Size of original header from file, used to expand/strip content.
    Integer tableHeaderSize = null;
    if (config.hasHeader) {
        // parse line for header
        final Row row = sheet.getRow(startRow++);
        if (row == null) {
            throw new ParseFailed("Header row is null!");
        }
        // getLastCellNum() is -1 for a row without cells; clamp so it can
        // be used as a collection capacity.
        final int columnEnd = Math.max(0, row.getLastCellNum());
        columnNames = new ArrayList<>(columnEnd);
        // Start from column 0 (not from the first used cell) so header
        // positions match the data rows, which are also read from column 0.
        for (int columnIndex = 0; columnIndex < columnEnd; columnIndex++) {
            final Cell cell = row.getCell(columnIndex);
            if (cell == null) {
                // The cell is missing, this happen for example if
                // document is exported from gdocs. We just log and use
                // 'null' as cell value.
                LOG.info("Header cell is null ({}, {}) on '{}'!", startRow - 1, columnIndex, wb.getSheetName(sheetIndex));
                columnNames.add(null);
            } else {
                columnNames.add(this.getCellValue(cell));
            }
        }
        if (config.stripHeader) {
            // Remove trailing null values. The loop stops before index 0,
            // so the first column is kept even if its name is null.
            final int initialSize = columnNames.size();
            for (int i = columnNames.size() - 1; i > 0; --i) {
                if (columnNames.get(i) == null) {
                    columnNames.remove(i);
                } else {
                    // Non null value.
                    break;
                }
            }
            LOG.info("Removal of nulls changed header size from {} to {}", initialSize, columnNames.size());
        } else {
            LOG.debug("Header size {}", columnNames.size());
        }
        // global names will be added later
        tableHeaderSize = columnNames.size();
    } else {
        columnNames = null;
    }
    //
    // prepare static cells
    //
    final List<String> namedCells = new ArrayList<>();
    for (NamedCell_V1 namedCell : config.namedCells) {
        // Row and column numbers in the configuration are 1-based.
        final Row row = sheet.getRow(namedCell.getRowNumber() - 1);
        if (row == null) {
            throw new ParseFailed("Row for named cell is null! (" + namedCell.getName() + ")");
        }
        final Cell cell = row.getCell(namedCell.getColumnNumber() - 1);
        if (cell == null) {
            throw new ParseFailed("Cell for named cell is null! (" + namedCell.getName() + ")");
        }
        // get value and add to namedCells
        final String value = getCellValue(cell);
        LOG.debug("static cell {} = {}", namedCell.getName(), value);
        namedCells.add(value);
    }
    //
    if (config.rowLimit == null) {
        LOG.debug("Row limit: not used");
    } else {
        LOG.debug("Row limit: {}", config.rowLimit);
    }
    // set if for first time or if we use static row counter
    if (!config.checkStaticRowCounter || rowNumber == 0) {
        rowNumber = config.hasHeader ? 2 : 1;
    }
    // go
    boolean headerGenerated = false;
    final int dataEndAtRow;
    if (config.rowLimit != null) {
        // limit number of lines
        dataEndAtRow = startRow + config.rowLimit;
    } else {
        // We increase by one, as we use less < dataEndAtRow,
        // not <= dataEndAtRow
        dataEndAtRow = sheet.getLastRowNum() + 1;
    }
    int skippedLinesCounter = 0;
    // startRow is already past the ignored lines (and the header), so no
    // additional skipping is required inside the loop. rowNumber is
    // advanced for every sheet row, including the skipped ones.
    for (int rowNumPerFile = startRow; rowNumPerFile < dataEndAtRow; ++rowNumber, ++rowNumPerFile) {
        // get row
        final Row row = sheet.getRow(rowNumPerFile);
        if (row == null) {
            continue;
        }
        // Both values are -1 for a row without cells.
        final int columnStart = row.getFirstCellNum();
        final int columnEnd = Math.max(0, row.getLastCellNum());
        // generate header
        if (!headerGenerated) {
            headerGenerated = true;
            // use row data to generate types
            final List<ColumnType> types = new ArrayList<>(columnEnd + namedCells.size());
            // We use zero as the first column must be column 1,
            // i.e. we always start at the left most column.
            for (int columnIndex = 0; columnIndex < columnEnd; columnIndex++) {
                final Cell cell = row.getCell(columnIndex);
                types.add(cell == null ? null : getCellType(cell));
            }
            if (columnNames == null) {
                // No header in the file, generate the names.
                LOG.info("Generating column names from: {} to: {}", columnStart, columnEnd);
                columnNames = new ArrayList<>(columnEnd);
                // First column is col1; names start at the left most
                // column for the same reason as the types above.
                for (int i = 0; i < columnEnd; i++) {
                    columnNames.add("col" + (i + 1));
                }
                tableHeaderSize = columnNames.size();
            } else {
                // expand types row. The header might be wider then the
                // first data row.
                fitToSize(types, tableHeaderSize);
            }
            // add user defined names
            for (NamedCell_V1 item : config.namedCells) {
                columnNames.add(item.getName());
                types.add(ColumnType.String);
            }
            // add global types and names
            columnNames.add(SHEET_COLUMN_NAME);
            types.add(ColumnType.String);
            // configure - the last argument is the number of leading empty
            // COLUMNS, which is the index of the first used cell in the row
            // (not the row index).
            TableToRdfConfigurator.configure(tableToRdf, columnNames, (List) types, Math.max(0, columnStart));
        }
        // Prepare row.
        final List<String> parsedRow = new ArrayList<>(columnEnd + namedCells.size());
        // parse columns
        for (int columnIndex = 0; columnIndex < columnEnd; columnIndex++) {
            final Cell cell = row.getCell(columnIndex);
            parsedRow.add(cell == null ? null : getCellValue(cell));
        }
        // Check for row null values - this can happen for excel exported
        // from google docs, where the number of declared data rows
        // is bigger then it should be together with fitToSize we would
        // generate non-existing columns. In order to prevent this
        // we scan and ignore lines with null values only.
        boolean isEmpty = true;
        for (String value : parsedRow) {
            if (value != null) {
                isEmpty = false;
                break;
            }
        }
        if (isEmpty) {
            ++skippedLinesCounter;
            continue;
        }
        // expand row if needed
        fitToSize(parsedRow, tableHeaderSize);
        // add named columns first !!
        parsedRow.addAll(namedCells);
        // add global data
        parsedRow.add(wb.getSheetName(sheetIndex));
        // convert into table
        tableToRdf.paserRow((List) parsedRow, rowNumber);
        if ((rowNumPerFile % 1000) == 0) {
            LOG.debug("Row number {} processed.", rowNumPerFile);
        }
    }
    //
    if (skippedLinesCounter != 0) {
        LOG.info("Some lines ({}) were skipped.", skippedLinesCounter);
    }
}
Also used : ColumnType(com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType) NamedCell_V1(com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.NamedCell_V1)

Example 3 with ColumnType

use of org.molgenis.emx2.ColumnType in project molgenis-emx2 by molgenis.

the class RefAndRefArrayTestExample method createRefAndRefArrayTestExample.

/**
 * Creates, for every supported key type, a pair of tables in the schema:
 * {@code <TYPE>_A} with a primary key column of that type, and
 * {@code <TYPE>_B} with a REF and a REF_ARRAY column pointing to
 * {@code <TYPE>_A}.
 *
 * @param schema schema in which the tables are created
 */
public static void createRefAndRefArrayTestExample(SchemaMetadata schema) {
    for (ColumnType keyType : new ColumnType[] { UUID, STRING, BOOL, INT, DECIMAL, TEXT, DATE, DATETIME }) {
        final String typeName = keyType.toString();

        // Referenced table: a single primary key column of the given type.
        final String targetTable = typeName + "_A";
        final String keyColumn = "AKeyOf" + typeName;
        schema.create(
            table(targetTable)
                .add(column(keyColumn).setType(keyType).setPkey()));

        // Referencing table: one plain reference and one reference array.
        final String sourceTable = typeName + "_B";
        final String refColumn = "RefToAKeyOf" + typeName;
        final String refArrayColumn = "RefArrayToAKeyOf" + typeName;
        schema.create(
            table(sourceTable)
                .add(column(refColumn).setType(REF).setRefTable(targetTable))
                .add(column(refArrayColumn).setType(REF_ARRAY).setRefTable(targetTable)));
    }
}
Also used : ColumnType(org.molgenis.emx2.ColumnType)

Example 4 with ColumnType

use of org.molgenis.emx2.ColumnType in project molgenis-emx2 by molgenis.

the class ArrayTypeTestExample method createSimpleTypeTest.

/**
 * Creates the "ArrayTypeTest" table in the schema, with an "id" primary key
 * and, for every array column type, one required column
 * ({@code Test_<type>}) and one non-required column
 * ({@code Test_<type>_nillable}).
 *
 * @param schema schema in which the table is created
 */
public static void createSimpleTypeTest(SchemaMetadata schema) {
    TableMetadata typeTestTable = schema.create(table("ArrayTypeTest"));
    typeTestTable.add(column("id").setPkey());
    ColumnType[] columnTypes = new ColumnType[] { UUID_ARRAY, STRING_ARRAY, BOOL_ARRAY, INT_ARRAY, DECIMAL_ARRAY, TEXT_ARRAY, DATE_ARRAY, DATETIME_ARRAY };
    for (ColumnType columnType : columnTypes) {
        // Locale.ROOT keeps the generated names independent of the JVM
        // default locale (e.g. Turkish would lower-case 'I' to a dotless i).
        final String columnName = "Test_" + columnType.toString().toLowerCase(java.util.Locale.ROOT);
        typeTestTable.add(column(columnName).setType(columnType).setRequired(true));
        typeTestTable.add(column(columnName + "_nillable").setType(columnType));
    }
}
Also used : TableMetadata(org.molgenis.emx2.TableMetadata) ColumnType(org.molgenis.emx2.ColumnType)

Example 5 with ColumnType

use of org.molgenis.emx2.ColumnType in project molgenis-emx2 by molgenis.

the class SimpleTypeTestExample method createSimpleTypeTest.

/**
 * Creates the {@code TYPE_TEST} table in the schema, with an "id" primary
 * key and, for every simple column type, one required column
 * ({@code "Test <type>"}) and one non-required column
 * ({@code "Test <type> nillable"}).
 *
 * @param schema schema in which the table is created
 */
public static void createSimpleTypeTest(SchemaMetadata schema) {
    TableMetadata typeTestTable = table(TYPE_TEST).add(column("id").setPkey());
    ColumnType[] columnTypes = new ColumnType[] { UUID, STRING, BOOL, INT, DECIMAL, TEXT, DATE, DATETIME };
    for (ColumnType columnType : columnTypes) {
        // Locale.ROOT keeps the generated names independent of the JVM
        // default locale (e.g. Turkish would lower-case 'I' to a dotless i).
        final String columnName = "Test " + columnType.toString().toLowerCase(java.util.Locale.ROOT);
        typeTestTable.add(column(columnName).setType(columnType).setRequired(true));
        typeTestTable.add(column(columnName + " nillable").setType(columnType));
    }
    // The table is registered only after all columns have been added.
    schema.create(typeTestTable);
}
Also used : TableMetadata(org.molgenis.emx2.TableMetadata) ColumnType(org.molgenis.emx2.ColumnType)

Aggregations

ColumnType (org.molgenis.emx2.ColumnType)11 Function (com.google.common.base.Function)10 Before (org.junit.Before)10 TableMetadata (org.molgenis.emx2.TableMetadata)5 ColumnType (com.google.api.ads.admanager.axis.v202105.ColumnType)2 ColumnType (com.google.api.ads.admanager.axis.v202108.ColumnType)2 ColumnType (com.google.api.ads.admanager.axis.v202111.ColumnType)2 ColumnType (com.google.api.ads.admanager.axis.v202202.ColumnType)2 ColumnType (com.google.api.ads.admanager.axis.v202205.ColumnType)2 ColumnType (com.google.api.ads.admanager.jaxws.v202108.ColumnType)2 ColumnType (com.google.api.ads.admanager.jaxws.v202202.ColumnType)2 ColumnType (com.google.api.ads.admanager.jaxws.v202205.ColumnType)2 ArrayList (java.util.ArrayList)2 Test (org.junit.Test)2 AdUnitTargeting (com.google.api.ads.admanager.axis.v202105.AdUnitTargeting)1 BooleanValue (com.google.api.ads.admanager.axis.v202105.BooleanValue)1 Date (com.google.api.ads.admanager.axis.v202105.Date)1 DateTime (com.google.api.ads.admanager.axis.v202105.DateTime)1 DateTimeValue (com.google.api.ads.admanager.axis.v202105.DateTimeValue)1 DateValue (com.google.api.ads.admanager.axis.v202105.DateValue)1