Usage example of org.knime.core.util.tokenizer.TokenizerException in project knime-core (by KNIME):
The method next() of the class FileRowIterator.
/**
 * {@inheritDoc}
 *
 * Assembles and returns the next row of the table: creates the row header,
 * then reads one token per file column (converting each non-skipped token
 * into a {@link DataCell}), and finally consumes the row delimiter. Enforces
 * the expected column count unless short lines are explicitly supported.
 *
 * @return the next {@link DataRow} read from the file
 * @throws NoSuchElementException if called after the last row was returned
 */
@Override
public DataRow next() {
    int rowLength = m_tableSpec.getNumColumns();
    int colsToRead = m_skipColumns.length;
    // the table spec contains only the non-skipped columns, so it can never
    // be wider than the number of columns we read from the file
    assert rowLength <= colsToRead;
    String token = null;
    boolean isMissingCell;
    String rowHeader;
    DataCell[] row = new DataCell[rowLength];
    // make sure there is another row to read - hasNext() also advances the
    // tokenizer past ignorable lines (if we are supposed to).
    if (!hasNext()) {
        throw new NoSuchElementException("The row iterator proceeded beyond the last line of '" + m_frSettings.getDataFileLocation().toString() + "'.");
    }
    // counts the columns (tokens) read from the file
    int readCols = 0;
    // counts the number of columns we've created (excl. skipped columns)
    int createdCols = 0;
    // create the row header for this row.
    // This will also read it from file, if supposed to.
    try {
        rowHeader = createRowHeader(m_rowNumber - 1);
    } catch (TokenizerException fte) {
        // wrap the tokenizer error with line number and source location
        throw prepareForException(fte.getMessage() + " (line: " + m_tokenizer.getLineNumber() + " source: '" + m_frSettings.getDataFileLocation() + "')", m_tokenizer.getLineNumber(), "ERR", row);
    }
    // we made sure before that there is at least one token in the stream
    assert rowHeader != null;
    // if the last token ended with the delimiter (and not a LF)
    boolean lastTokenWasDelimited = false;
    // Now, read the columns until we have enough or see a row delimiter
    while (readCols < colsToRead) {
        try {
            token = m_tokenizer.nextToken();
        } catch (TokenizerException fte) {
            // wrap the tokenizer error with line number, row id and source
            throw prepareForException(fte.getMessage() + " (line: " + m_tokenizer.getLineNumber() + " (" + rowHeader + ") source: '" + m_frSettings.getDataFileLocation() + "')", m_tokenizer.getLineNumber(), rowHeader, row);
        }
        if (token != null) {
            // remember the delimiter of the last token before the EOF
            lastTokenWasDelimited = m_tokenizer.lastTokenWasDelimited();
        }
        // row delims are returned as token (null means end of file)
        if ((token == null) || m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted())) {
            // line ended early.
            m_tokenizer.pushBack();
            // we need the row delim in the file, for after the loop
            break;
        }
        // an unquoted empty token means a missing cell (e.g. two consecutive
        // column delimiters).
        if (token.equals("") && (!m_tokenizer.lastTokenWasQuoted())) {
            isMissingCell = true;
        } else if (token.equals(m_frSettings.getMissingValueOfColumn(readCols))) {
            // the token matches the column's missing value pattern; this is
            // equals(null) if it was not specified - which is fine.
            isMissingCell = true;
        } else {
            isMissingCell = false;
        }
        if (!m_skipColumns[readCols]) {
            DataColumnSpec cSpec = m_tableSpec.getColumnSpec(createdCols);
            // now get that new cell
            // (it throws an exception at us if it couldn't)
            row[createdCols] = createNewDataCellOfType(cSpec.getType(), token, isMissingCell, m_frSettings.getFormatParameterForColumn(readCols).orElse(null), rowHeader, row);
            createdCols++;
        }
        readCols++;
    }
    // if the file ended one column short, treat the trailing cell as missing -
    // but only if the last token was actually delimited (with a swallowed delimiter - not LF)
    if (token == null && readCols == colsToRead - 1 && lastTokenWasDelimited) {
        if (!m_skipColumns[readCols]) {
            row[createdCols++] = DataType.getMissingCell();
        }
        // we consumed this last delimiter:
        lastTokenWasDelimited = false;
    }
    // for error reporting: if we already swallowed the LF of this row, the
    // tokenizer's line number points one past the row we are building
    int lineNr = m_tokenizer.getLineNumber();
    if ((lineNr > 0) && (token != null) && (token.equals("\n"))) {
        lineNr--;
    }
    // if we didn't get enough columns: puke and die - unless we are told otherwise
    if (m_frSettings.getSupportShortLines()) {
        // pad the row with missing values
        while (createdCols < rowLength) {
            row[createdCols++] = DataType.getMissingCell();
        }
    } else {
        if (createdCols < rowLength) {
            FileReaderException ex = prepareForException("Too few data elements " + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
            if (m_frSettings.getColumnNumDeterminingLineNumber() >= 0) {
                ex.setDetailsMessage("The number of columns was " + "determined by the entries above line no." + m_frSettings.getColumnNumDeterminingLineNumber());
            }
            throw ex;
        }
    }
    // read one more token - it should be the row delimiter we pushed back above
    token = m_tokenizer.nextToken();
    if (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted())) {
        // there is extra data in this line; remember whether the last real
        // data token was properly delimited (flag for real data tokens)
        lastTokenWasDelimited = m_tokenizer.lastTokenWasDelimited();
    }
    // eat all empty tokens til the end of the row, if we're supposed to
    if (m_frSettings.ignoreEmptyTokensAtEndOfRow()) {
        lastTokenWasDelimited = false;
        while (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted()) && token.equals("") && (!m_tokenizer.lastTokenWasQuoted())) {
            try {
                token = m_tokenizer.nextToken();
            } catch (TokenizerException fte) {
                throw prepareForException(fte.getMessage() + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
            }
        }
    }
    // if there are still more data items in the file than we needed for one row: barf and die.
    if (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted()) || lastTokenWasDelimited) {
        FileReaderException ex = prepareForException("Too many data elements " + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
        if (m_frSettings.getColumnNumDeterminingLineNumber() >= 0) {
            ex.setDetailsMessage("The number of columns was " + "determined by line no." + m_frSettings.getColumnNumDeterminingLineNumber());
        }
        throw ex;
    }
    m_rowNumber++;
    // report progress
    // only if an execution context exists an if the underlying
    // URL is a file whose size can be determined
    double readBytes = m_source.getNumberOfBytesRead();
    if (m_exec != null && m_source.getFileSize() > 0 && readBytes / PROGRESS_JUNK_SIZE > m_lastReport) {
        // assert readBytes <= m_frSettings.getDataFileSize();
        m_exec.setProgress(readBytes / m_source.getFileSize());
        m_lastReport++;
    }
    return new DefaultRow(rowHeader, row);
}
End of aggregated usage examples.