Search in sources :

Example 1 with TokenizerException

use of org.knime.core.util.tokenizer.TokenizerException in project knime-core by knime.

In the class FileRowIterator, method next.

/**
 * {@inheritDoc}
 *
 * <p>
 * Reads the tokens of one row from the tokenizer and turns them into a
 * {@link DataRow}. One token is read per entry of {@code m_skipColumns}; a
 * cell is only created for columns that are not marked as skipped. Rows that
 * end early are either padded with missing cells (if the settings support
 * short lines) or rejected.
 *
 * @return the next row, consisting of the row header and one cell per column
 *         of the table spec
 * @throws NoSuchElementException if {@link #hasNext()} returns
 *             {@code false}
 * @throws FileReaderException if the tokenizer reports an error while
 *             reading, or if the row contains too few or too many data
 *             elements
 */
@Override
public DataRow next() {
    // number of cells in the row we return vs. number of tokens to read
    // from the file (the latter includes the skipped columns)
    int rowLength = m_tableSpec.getNumColumns();
    int colsToRead = m_skipColumns.length;
    assert rowLength <= colsToRead;
    String token = null;
    boolean isMissingCell;
    String rowHeader;
    DataCell[] row = new DataCell[rowLength];
    // refuse to read beyond the end of the data
    if (!hasNext()) {
        throw new NoSuchElementException("The row iterator proceeded beyond the last line of '" + m_frSettings.getDataFileLocation().toString() + "'.");
    }
    // counts the columns (tokens) read from the file
    int readCols = 0;
    // counts the number of columns we've created (excl. skipped columns)
    int createdCols = 0;
    // Create the row header first.
    // This will also read it from file, if supposed to.
    try {
        rowHeader = createRowHeader(m_rowNumber - 1);
    } catch (TokenizerException fte) {
        // no row header available yet - "ERR" is passed in its place
        throw prepareForException(fte.getMessage() + " (line: " + m_tokenizer.getLineNumber() + " source: '" + m_frSettings.getDataFileLocation() + "')", m_tokenizer.getLineNumber(), "ERR", row);
    }
    // we made sure before that there is at least one token in the stream
    assert rowHeader != null;
    // if the last token ended with the delimiter (and not a LF)
    boolean lastTokenWasDelimited = false;
    // Now, read the columns until we have enough or see a row delimiter
    while (readCols < colsToRead) {
        try {
            token = m_tokenizer.nextToken();
        } catch (TokenizerException fte) {
            throw prepareForException(fte.getMessage() + " (line: " + m_tokenizer.getLineNumber() + " (" + rowHeader + ") source: '" + m_frSettings.getDataFileLocation() + "')", m_tokenizer.getLineNumber(), rowHeader, row);
        }
        if (token != null) {
            // remember the delimiter of the last token before the EOF
            lastTokenWasDelimited = m_tokenizer.lastTokenWasDelimited();
        }
        // row delims are returned as token
        if ((token == null) || m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted())) {
            // line ended early.
            m_tokenizer.pushBack();
            // we need the row delim in the file, for after the loop
            break;
        }
        // An empty, unquoted token is treated as a missing cell, and so is
        // a token that equals the missing value pattern of this column.
        if (token.equals("") && (!m_tokenizer.lastTokenWasQuoted())) {
            isMissingCell = true;
        } else if (token.equals(m_frSettings.getMissingValueOfColumn(readCols))) {
            // equals(null) if it was not specified - which is fine.
            isMissingCell = true;
        } else {
            isMissingCell = false;
        }
        // tokens of skipped columns are read but no cell is created for them
        if (!m_skipColumns[readCols]) {
            DataColumnSpec cSpec = m_tableSpec.getColumnSpec(createdCols);
            // now get that new cell
            // (it throws an exception at us if it couldn't)
            row[createdCols] = createNewDataCellOfType(cSpec.getType(), token, isMissingCell, m_frSettings.getFormatParameterForColumn(readCols).orElse(null), rowHeader, row);
            createdCols++;
        }
        readCols++;
    }
    // If we ran out of tokens (token == null) with exactly one column left
    // to read, add a missing cell for that last column -
    // but only if the last token was actually delimited (with a swallowed delimiter - not LF)
    if (token == null && readCols == colsToRead - 1 && lastTokenWasDelimited) {
        if (!m_skipColumns[readCols]) {
            row[createdCols++] = DataType.getMissingCell();
        }
        // we consumed this last delimiter:
        lastTokenWasDelimited = false;
    }
    // Line number used in error messages below. If the last token read was
    // a line feed, report the previous line.
    // NOTE(review): presumably the tokenizer has already counted that line
    // feed at this point - confirm against the tokenizer.
    int lineNr = m_tokenizer.getLineNumber();
    if ((lineNr > 0) && (token != null) && (token.equals("\n"))) {
        lineNr--;
    }
    // In case we've seen a row delimiter before the row was complete:
    // puke and die - unless we are told otherwise
    if (m_frSettings.getSupportShortLines()) {
        // pad the row with missing values
        while (createdCols < rowLength) {
            row[createdCols++] = DataType.getMissingCell();
        }
    } else {
        if (createdCols < rowLength) {
            FileReaderException ex = prepareForException("Too few data elements " + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
            if (m_frSettings.getColumnNumDeterminingLineNumber() >= 0) {
                ex.setDetailsMessage("The number of columns was " + "determined by the entries above line no." + m_frSettings.getColumnNumDeterminingLineNumber());
            }
            throw ex;
        }
    }
    // Now read the row delimiter from the file. Tokenizer errors are
    // converted the same way as for the other nextToken() calls in this
    // method, so that callers get the line/row/source context instead of a
    // bare TokenizerException.
    try {
        token = m_tokenizer.nextToken();
    } catch (TokenizerException fte) {
        throw prepareForException(fte.getMessage() + " (line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
    }
    if (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted())) {
        // flag for real data tokens
        lastTokenWasDelimited = m_tokenizer.lastTokenWasDelimited();
    }
    // eat all empty tokens til the end of the row, if we're supposed to
    if (m_frSettings.ignoreEmptyTokensAtEndOfRow()) {
        lastTokenWasDelimited = false;
        while (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted()) && token.equals("") && (!m_tokenizer.lastTokenWasQuoted())) {
            try {
                token = m_tokenizer.nextToken();
            } catch (TokenizerException fte) {
                throw prepareForException(fte.getMessage() + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
            }
        }
    }
    // If the token we are looking at now is not a row delimiter (or the
    // last data token was followed by a column delimiter), there are more
    // data items in the file than we needed for one row: barf and die.
    if (!m_frSettings.isRowDelimiter(token, m_tokenizer.lastTokenWasQuoted()) || lastTokenWasDelimited) {
        FileReaderException ex = prepareForException("Too many data elements " + "(line: " + lineNr + " (" + rowHeader + "), source: '" + m_frSettings.getDataFileLocation() + "')", lineNr, rowHeader, row);
        if (m_frSettings.getColumnNumDeterminingLineNumber() >= 0) {
            ex.setDetailsMessage("The number of columns was " + "determined by line no." + m_frSettings.getColumnNumDeterminingLineNumber());
        }
        throw ex;
    }
    m_rowNumber++;
    // report progress
    // only if an execution context exists and if the underlying
    // URL is a file whose size can be determined
    double readBytes = m_source.getNumberOfBytesRead();
    // m_lastReport counts the chunks of PROGRESS_JUNK_SIZE bytes already
    // reported, so progress is set at most once per chunk
    if (m_exec != null && m_source.getFileSize() > 0 && readBytes / PROGRESS_JUNK_SIZE > m_lastReport) {
        // assert readBytes <= m_frSettings.getDataFileSize();
        m_exec.setProgress(readBytes / m_source.getFileSize());
        m_lastReport++;
    }
    return new DefaultRow(rowHeader, row);
}
Also used : DataColumnSpec(org.knime.core.data.DataColumnSpec) DataCell(org.knime.core.data.DataCell) TokenizerException(org.knime.core.util.tokenizer.TokenizerException) DefaultRow(org.knime.core.data.def.DefaultRow) NoSuchElementException(java.util.NoSuchElementException)

Aggregations

NoSuchElementException (java.util.NoSuchElementException): 1, DataCell (org.knime.core.data.DataCell): 1, DataColumnSpec (org.knime.core.data.DataColumnSpec): 1, DefaultRow (org.knime.core.data.def.DefaultRow): 1, TokenizerException (org.knime.core.util.tokenizer.TokenizerException): 1