use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project etl by linkedpipes.
the class TableToRdfConfigurator method configure.
/**
 * Configure given {@link TableToRdf} convertor.
 *
 * <p>Resets {@code baseUri}, {@code infoMap}, {@code keyColumn} and {@code nameToIndex} on the
 * converter, builds one value generator per mapped column (plus one per advanced mapping),
 * compiles them and stores them in {@code infoMap}. Optionally sets the key column generator
 * and the row class.
 *
 * @param tableToRdf converter to configure; its {@code config} is read and its state overwritten
 * @param header column names, or {@code null} to use generated names {@code col1}, {@code col2}, ...
 * @param data Contains first data row, or ColumnType if type is already known.
 * @param numberOfLeadingEmpty Number of leading empty columns, this is useful for xsl-like.
 *     Columns with index below this value and type {@code Auto} are typed as String.
 * @throws ParseFailed if {@code data} is {@code null}, if header and data sizes differ, or if
 *     the configured row class cannot be turned into an IRI
 * @throws LpException declared for the helpers invoked here (presumably template generation
 *     and compilation - confirm against their declarations)
 */
public static void configure(TableToRdf tableToRdf, List<String> header, List<Object> data, int numberOfLeadingEmpty) throws ParseFailed, LpException {
    // initial checks
    if (data == null) {
        throw new ParseFailed("First data row is null!");
    }
    if (header != null && header.size() != data.size()) {
        throw new ParseFailed("Diff number of cells in header (" + header.size() + ") and data (" + data.size() + ")");
    }
    final TableToRdfConfig config = tableToRdf.config;
    // clear configuration
    tableToRdf.baseUri = config.baseURI;
    tableToRdf.infoMap = null;
    tableToRdf.keyColumn = null;
    tableToRdf.nameToIndex = new HashMap<>();
    // User-configured columns; entries are removed as they are matched, so whatever remains
    // at the end did not correspond to any column of the table.
    final Map<String, ColumnInfo_V1> unused = new HashMap<>(config.columnsInfo);
    final List<ValueGenerator> valueGenerators = new ArrayList<>(data.size());
    // generate configuration - Column Mapping
    String keyTemplateStr = null;
    for (int index = 0; index < data.size(); index++) {
        // Resolve the column name: header value if present, otherwise col1, col2, ...
        final String columnName;
        if (header != null && header.get(index) != null) {
            columnName = header.get(index);
        } else {
            if (header != null) {
                LOG.info("Generated value used for column with 'null' name.");
            }
            columnName = "col" + (index + 1);
        }
        // columnName is never null here, a name is always generated above.
        LOG.debug("New column found '{}'", columnName);
        tableToRdf.nameToIndex.put(columnName, index);
        // test for key - only for the simple (non-advanced) key column
        if (config.keyColumn != null && !config.keyColumn.isEmpty() && !config.advancedKeyColumn && config.keyColumn.equals(columnName)) {
            // we construct template and use it
            keyTemplateStr = "<" + prepareAsUri("{", config) + columnName + "}>";
        }
        // check for user template
        final ColumnInfo_V1 columnInfo;
        if (config.columnsInfo.containsKey(columnName)) {
            // use user config
            // NOTE(review): the instance stored in config.columnsInfo is mutated below
            // (setURI/setType), so a later configure() call sees the already prepared URI
            // and the resolved type - confirm this is intended.
            columnInfo = config.columnsInfo.get(columnName);
            unused.remove(columnName);
        } else if (!config.generateNew) {
            // no new generation
            continue;
        } else {
            // generate new
            columnInfo = new ColumnInfo_V1();
        }
        // fill other values if needed
        if (columnInfo.getURI() == null) {
            columnInfo.setURI(config.baseURI + Utils.convertStringToIRIPart(columnName));
        } else {
            columnInfo.setURI(prepareAsUri(columnInfo.getURI(), config));
        }
        if (columnInfo.getType() == ColumnType.Auto) {
            if (index < numberOfLeadingEmpty) {
                // This is empty leading column, we use string without warning.
                columnInfo.setType(ColumnType.String);
            } else if (config.autoAsStrings) {
                columnInfo.setType(ColumnType.String);
            } else {
                columnInfo.setType(guessType(columnName, data.get(index), columnInfo.isUseTypeFromDfb()));
            }
        }
        // generate tableToRdf configuration from 'columnInfo'
        final String template = generateTemplate(columnInfo, columnName);
        LOG.debug("Template for column '{}' is '{}'", columnName, template);
        // add to configuration
        valueGenerators.add(ValueGeneratorReplace.create(tableToRdf.valueFactory.createIRI(columnInfo.getURI()), template));
        // generate metadata about column - for now only labels
        if (config.generateLabels) {
            tableToRdf.outRdf.add(tableToRdf.valueFactory.createIRI(columnInfo.getURI()), RDFS.LABEL, tableToRdf.valueFactory.createLiteral(columnName));
        }
    }
    // key template; when neither branch applies keyColumn stays null and the row number is used
    if (config.advancedKeyColumn) {
        // we use keyColumn directly
        tableToRdf.keyColumn = ValueGeneratorReplace.create(null, config.keyColumn);
        tableToRdf.keyColumn.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    } else if (keyTemplateStr != null) {
        // we have constructed template
        LOG.info("Key column template: {}", keyTemplateStr);
        tableToRdf.keyColumn = ValueGeneratorReplace.create(null, keyTemplateStr);
        tableToRdf.keyColumn.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    }
    // TODO: we do not support this functionality ..
    // Report configured columns that did not match any column of the table.
    for (Map.Entry<String, ColumnInfo_V1> entry : unused.entrySet()) {
        final String key = entry.getKey();
        if (key.isEmpty()) {
            // - bug fix
            continue;
        }
        if (config.ignoreMissingColumn) {
            LOG.info("Column '{}' (uri:{}) ignored as does not match original columns.", key, entry.getValue().getURI());
        } else {
            LOG.error("Column '{}' (uri:{}) ignored as does not match original columns.", key, entry.getValue().getURI());
        }
    }
    // add advanced
    for (TabularConfig_V2.AdvanceMapping item : config.columnsInfoAdv) {
        // prepare URI
        final String uri = prepareAsUri(item.getUri(), config);
        // add template
        valueGenerators.add(ValueGeneratorReplace.create(tableToRdf.valueFactory.createIRI(uri), item.getTemplate()));
    }
    // Compile valueGenerators
    for (ValueGenerator generator : valueGenerators) {
        generator.compile(tableToRdf.nameToIndex, tableToRdf.valueFactory);
    }
    // final checks and data sets
    tableToRdf.infoMap = valueGenerators.toArray(new ValueGenerator[0]);
    if (config.rowsClass != null && !config.rowsClass.isEmpty()) {
        try {
            tableToRdf.rowClass = tableToRdf.valueFactory.createIRI(config.rowsClass);
        } catch (IllegalArgumentException ex) {
            throw new ParseFailed("Failed to create row's class URI from:" + config.rowsClass, ex);
        }
    }
}
use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project etl by linkedpipes.
the class ParserXls method parseSheet.
/**
 * Parse given sheet.
 *
 * <p>Optionally reads a header row, collects the configured named cells, configures the
 * converter from the first non-null data row and then passes every row that contains at
 * least one non-null value to {@code tableToRdf}.
 *
 * @param wb workbook that contains the sheet
 * @param sheetIndex index of the sheet in {@code wb}
 * @throws ParseFailed if the header row, or the row or cell of a named cell, is missing
 * @throws LpException declared for the helpers invoked here (presumably configuration and
 *     row conversion - confirm against their declarations)
 */
public void parseSheet(Workbook wb, Integer sheetIndex) throws ParseFailed, LpException {
    LOG.debug("parseSheet({}, {})", wb.getSheetName(sheetIndex), sheetIndex);
    final Sheet sheet = wb.getSheetAt(sheetIndex);
    if (config.numberOfStartLinesToIgnore > sheet.getLastRowNum()) {
        // no data to parse
        return;
    }
    // generate column names
    int startRow = config.numberOfStartLinesToIgnore;
    List<String> columnNames;
    // Size of original header from file, used to expand/strip content.
    Integer tableHeaderSize = null;
    if (config.hasHeader) {
        // parse line for header; startRow then points at the first data row
        final Row row = sheet.getRow(startRow++);
        if (row == null) {
            throw new ParseFailed("Header row is null!");
        }
        final int columnStart = row.getFirstCellNum();
        final int columnEnd = row.getLastCellNum();
        columnNames = new ArrayList<>(columnEnd);
        // NOTE(review): names are collected from the first non-empty header cell, while data
        // rows below are read from column 0; with leading empty header cells the names may be
        // shifted relative to the data - confirm.
        for (int columnIndex = columnStart; columnIndex < columnEnd; columnIndex++) {
            final Cell cell = row.getCell(columnIndex);
            if (cell == null) {
                // The cell is missing, this happen for example if
                // document is exported from gdocs. We just log and use
                // 'null' as cell value.
                LOG.info("Header cell is null ({}, {}) on '{}'!", startRow - 1, columnIndex, wb.getSheetName(sheetIndex));
                columnNames.add(null);
            } else {
                final String name = this.getCellValue(cell);
                columnNames.add(name);
            }
        }
        if (config.stripHeader) {
            // Remove trailing null values. Index 0 is never removed (loop stops at i > 0),
            // so the header keeps at least one entry.
            int initialSize = columnNames.size();
            for (int i = columnNames.size() - 1; i > 0; --i) {
                if (columnNames.get(i) == null) {
                    columnNames.remove(i);
                } else {
                    // Non null value.
                    break;
                }
            }
            LOG.info("Removal of nulls changed header size from {} to {}", initialSize, columnNames.size());
        } else {
            LOG.debug("Header size {}", columnNames.size());
        }
        // global names will be added later
        tableHeaderSize = columnNames.size();
    } else {
        columnNames = null;
    }
    //
    // prepare static cells
    //
    final List<String> namedCells = new LinkedList<>();
    for (NamedCell_V1 namedCell : config.namedCells) {
        // Row and column numbers in the configuration are one-based.
        final Row row = sheet.getRow(namedCell.getRowNumber() - 1);
        if (row == null) {
            throw new ParseFailed("Row for named cell is null! (" + namedCell.getName() + ")");
        }
        final Cell cell = row.getCell(namedCell.getColumnNumber() - 1);
        if (cell == null) {
            throw new ParseFailed("Cell for named cell is null! (" + namedCell.getName() + ")");
        }
        // get value and add to namedCells
        final String value = getCellValue(cell);
        LOG.debug("static cell {} = {}", namedCell.getName(), value);
        namedCells.add(value);
    }
    //
    if (config.rowLimit == null) {
        LOG.debug("Row limit: not used");
    } else {
        LOG.debug("Row limit: {}", config.rowLimit);
    }
    // set if for first time or if we use static row counter
    if (!config.checkStaticRowCounter || rowNumber == 0) {
        rowNumber = config.hasHeader ? 2 : 1;
    }
    // go
    boolean headerGenerated = false;
    final int dataEndAtRow;
    if (config.rowLimit != null) {
        // limit number of lines
        dataEndAtRow = startRow + config.rowLimit;
    } else {
        // We increase by one, as we use less < dataEndAtRow,
        // not <= dataEndAtRow
        dataEndAtRow = sheet.getLastRowNum() + 1;
    }
    int skippedLinesCounter = 0;
    // The loop starts at startRow, which is never below numberOfStartLinesToIgnore, so no
    // additional "skip till data" check is needed. A primitive index avoids boxing per row.
    for (int rowNumPerFile = startRow; rowNumPerFile < dataEndAtRow; ++rowNumber, ++rowNumPerFile) {
        // get row
        final Row row = sheet.getRow(rowNumPerFile);
        if (row == null) {
            continue;
        }
        final int columnStart = row.getFirstCellNum();
        final int columnEnd = row.getLastCellNum();
        // generate header
        if (!headerGenerated) {
            headerGenerated = true;
            // Use row data to generate types. We start from column 0 (not columnStart)
            // to always start at the first column.
            final List<ColumnType> types = new ArrayList<>(columnEnd + namedCells.size());
            for (int columnIndex = 0; columnIndex < columnEnd; columnIndex++) {
                final Cell cell = row.getCell(columnIndex);
                if (cell == null) {
                    types.add(null);
                    continue;
                }
                types.add(getCellType(cell));
            }
            if (columnNames == null) {
                // No header in the file, generate names.
                LOG.info("Generating column names from: {} to: {}", columnStart, columnEnd);
                columnNames = new ArrayList<>(columnEnd);
                // Generate column names, first column is col1. We start
                // from 0 as we always want start with left most column.
                int columnIndex = 0;
                for (int i = 0; i < columnEnd; i++) {
                    columnNames.add("col" + Integer.toString(++columnIndex));
                }
                tableHeaderSize = columnNames.size();
            } else {
                // expand types row. The header might be wider then the
                // first data row.
                fitToSize(types, tableHeaderSize);
            }
            // add user defined names
            for (NamedCell_V1 item : config.namedCells) {
                columnNames.add(item.getName());
                types.add(ColumnType.String);
            }
            // add global types and names
            columnNames.add(SHEET_COLUMN_NAME);
            types.add(ColumnType.String);
            // configure - the last argument is the number of leading empty columns, i.e. the
            // index of the first cell of this row (columnStart), not the row index startRow.
            TableToRdfConfigurator.configure(tableToRdf, columnNames, (List) types, columnStart);
        }
        // Prepare row.
        final List<String> parsedRow = new ArrayList<>(columnEnd + namedCells.size());
        // parse columns
        for (int columnIndex = 0; columnIndex < columnEnd; columnIndex++) {
            final Cell cell = row.getCell(columnIndex);
            if (cell == null) {
                parsedRow.add(null);
            } else {
                parsedRow.add(getCellValue(cell));
            }
        }
        // Check for row null values - this can happen for excel exported
        // from google docs, where the number of declared data rows
        // is bigger then it should be; together with fitToSize we would
        // generate non-existing columns. In order to prevent this
        // we scan and ignore lines with null values only.
        boolean isEmpty = true;
        for (Object value : parsedRow) {
            if (value != null) {
                isEmpty = false;
                break;
            }
        }
        if (isEmpty) {
            ++skippedLinesCounter;
            continue;
        }
        // expand row if needed
        fitToSize(parsedRow, tableHeaderSize);
        // add named columns first !!
        parsedRow.addAll(namedCells);
        // add global data
        parsedRow.add(wb.getSheetName(sheetIndex));
        // convert into table
        tableToRdf.paserRow((List) parsedRow, rowNumber);
        if ((rowNumPerFile % 1000) == 0) {
            LOG.debug("Row number {} processed.", rowNumPerFile);
        }
    }
    //
    if (skippedLinesCounter != 0) {
        LOG.info("Some lines ({}) were skipped.", skippedLinesCounter);
    }
}
use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project molgenis-emx2 by molgenis.
the class RefAndRefArrayTestExample method createRefAndRefArrayTestExample.
/**
 * For each listed column type creates two tables in the schema: {@code <TYPE>_A} with a
 * primary key column of that type, and {@code <TYPE>_B} with a REF and a REF_ARRAY column
 * that both point to {@code <TYPE>_A}.
 *
 * @param schema schema in which the tables are created
 */
public static void createRefAndRefArrayTestExample(SchemaMetadata schema) {
    final ColumnType[] keyTypes = {UUID, STRING, BOOL, INT, DECIMAL, TEXT, DATE, DATETIME};
    for (ColumnType keyType : keyTypes) {
        // Target table: single primary key column of the current type.
        final String targetTable = keyType + "_A";
        schema.create(table(targetTable).add(column("AKeyOf" + keyType).setType(keyType).setPkey()));

        // Referring table: one single-valued and one array-valued reference to the target.
        final String referringTable = keyType + "_B";
        schema.create(
            table(referringTable)
                .add(column("RefToAKeyOf" + keyType).setType(REF).setRefTable(targetTable))
                .add(column("RefArrayToAKeyOf" + keyType).setType(REF_ARRAY).setRefTable(targetTable)));
    }
}
use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project molgenis-emx2 by molgenis.
the class ArrayTypeTestExample method createSimpleTypeTest.
/**
 * Creates table {@code ArrayTypeTest} with an {@code id} primary key and, for every listed
 * array column type, a required column {@code Test_<type>} and a non-required column
 * {@code Test_<type>_nillable}.
 *
 * @param schema schema in which the table is created
 */
public static void createSimpleTypeTest(SchemaMetadata schema) {
    TableMetadata typeTestTable = schema.create(table("ArrayTypeTest"));
    typeTestTable.add(column("id").setPkey());
    ColumnType[] columnTypes = new ColumnType[] { UUID_ARRAY, STRING_ARRAY, BOOL_ARRAY, INT_ARRAY, DECIMAL_ARRAY, TEXT_ARRAY, DATE_ARRAY, DATETIME_ARRAY };
    for (ColumnType columnType : columnTypes) {
        // Lower-case with Locale.ROOT so the generated column names do not depend on the
        // JVM default locale (e.g. Turkish would turn "INT" into a dotless-i variant).
        final String columnName = "Test_" + columnType.toString().toLowerCase(java.util.Locale.ROOT);
        typeTestTable.add(column(columnName).setType(columnType).setRequired(true));
        typeTestTable.add(column(columnName + "_nillable").setType(columnType));
    }
}
use of com.linkedpipes.plugin.transformer.tabularuv.TabularConfig_V2.ColumnType in project molgenis-emx2 by molgenis.
the class SimpleTypeTestExample method createSimpleTypeTest.
/**
 * Builds table {@code TYPE_TEST} with an {@code id} primary key and, for every listed column
 * type, a required column {@code "Test <type>"} and a non-required column
 * {@code "Test <type> nillable"}, then creates the table in the schema.
 *
 * @param schema schema in which the table is created
 */
public static void createSimpleTypeTest(SchemaMetadata schema) {
    TableMetadata typeTestTable = table(TYPE_TEST).add(column("id").setPkey());
    ColumnType[] columnTypes = new ColumnType[] { UUID, STRING, BOOL, INT, DECIMAL, TEXT, DATE, DATETIME };
    for (ColumnType columnType : columnTypes) {
        // Lower-case with Locale.ROOT so the generated column names do not depend on the
        // JVM default locale (e.g. Turkish would turn "INT" into a dotless-i variant).
        final String columnName = "Test " + columnType.toString().toLowerCase(java.util.Locale.ROOT);
        typeTestTable.add(column(columnName).setType(columnType).setRequired(true));
        typeTestTable.add(column(columnName + " nillable").setType(columnType));
    }
    schema.create(typeTestTable);
}
Aggregations