Search in sources :

Example 1 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method addQuotes.

/**
 * Guesses which quote characters the file uses and registers them in the settings object. Each non-empty line is
 * scanned for double and single quote characters. A quote character that appears an odd number of times in a line
 * is ruled out as quote - unless the same line also contains an odd number of backslash-escaped occurrences of
 * it, in which case the quote character is kept but registered without the escape character.
 *
 * @param settings the object to add quote settings to. Must contain file location and possibly comments - but no
 *            delimiters yet!
 * @param exec to check for cancellations and to report progress
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static void addQuotes(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    assert settings != null;
    assert settings.getAllQuotes().size() == 0;
    assert settings.getDataFileLocation() != null;
    assert settings.getAllDelimiters().size() == 0;
    final BufferedFileReader input = settings.createNewInputReader();
    final Tokenizer lineTokenizer = new Tokenizer(input);
    final double totalBytes = input.getFileSize();
    exec.setProgress("Guessing quotes");
    // temporarily register line breaks as the only delimiters: every token is then one line of the file
    settings.addDelimiterPattern("\n", true, false, false);
    settings.addDelimiterPattern("\r", true, false, false);
    lineTokenizer.setSettings(settings);
    // the tokenizer got its settings - put the settings object back into its original state
    settings.removeAllDelimiters();
    int lineCount = 0;
    // start optimistic: both " and ' are quotes, both with \ as escape character
    boolean doubleIsQuote = true;
    boolean doubleHasEscape = true;
    boolean singleIsQuote = true;
    boolean singleHasEscape = true;
    try {
        // a null token marks the end of the file
        for (String line = lineTokenizer.nextToken(); line != null; line = lineTokenizer.nextToken()) {
            if (line.isEmpty()) {
                // empty lines carry no information
                continue;
            }
            lineCount++;
            // cutItShort also checks for interrupt
            if (cutItShort(exec) && (lineCount > getShortCutLines(exec))) {
                settings.setAnalyzeUsedAllRows(false);
                break;
            }
            if (cutItShort(exec)) {
                exec.setProgress(lineCount / (double) getShortCutLines(exec));
            } else if (totalBytes > 0) {
                exec.setProgress(input.getNumberOfBytesRead() / totalBytes);
            }
            // per-line statistics: plain and backslash-escaped occurrences of each quote character
            int plainDouble = 0;
            int escapedDouble = 0;
            int plainSingle = 0;
            int escapedSingle = 0;
            // true while the previous character was an (unescaped) backslash
            boolean afterBackslash = false;
            for (final char ch : line.toCharArray()) {
                switch (ch) {
                    case '\\':
                        // a second backslash in a row ends escape mode, a first one starts it
                        afterBackslash = !afterBackslash;
                        continue;
                    case '"':
                        if (afterBackslash) {
                            escapedDouble++;
                        } else {
                            plainDouble++;
                        }
                        break;
                    case '\'':
                        if (afterBackslash) {
                            escapedSingle++;
                        } else {
                            plainSingle++;
                        }
                        break;
                    default:
                        break;
                }
                // any character other than a backslash ends escape mode
                afterBackslash = false;
            }
            // evaluate the double quotes of this line
            if ((plainDouble & 1) == 1) {
                if ((escapedDouble & 1) == 1) {
                    // counting the escaped ones as well evens it out: keep the quote, drop the escape
                    doubleHasEscape = false;
                } else {
                    // unbalanced - double quotes can't be quotes
                    doubleIsQuote = false;
                    if (!singleIsQuote) {
                        // both ruled out, nothing left to find out
                        break;
                    }
                }
            }
            // evaluate the single quotes of this line
            if ((plainSingle & 1) == 1) {
                if ((escapedSingle & 1) == 1) {
                    // counting the escaped ones as well evens it out: keep the quote, drop the escape
                    singleHasEscape = false;
                } else {
                    // unbalanced - single quotes can't be quotes
                    singleIsQuote = false;
                    if (!doubleIsQuote) {
                        // both ruled out, nothing left to find out
                        break;
                    }
                }
            }
        }
        // store the outcome in the settings
        if (doubleIsQuote) {
            if (!doubleHasEscape) {
                settings.addQuotePattern("\"", "\"");
            } else {
                settings.addQuotePattern("\"", "\"", '\\');
            }
        }
        if (singleIsQuote) {
            if (!singleHasEscape) {
                settings.addQuotePattern("'", "'");
            } else {
                settings.addQuotePattern("'", "'", '\\');
            }
        }
    } finally {
        // release the stream, also if the analysis got interrupted
        lineTokenizer.closeSourceStream();
    }
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 2 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method createColumnTypes.

/**
 * Guesses a type and a missing value pattern for each column by reading the file with the settings in
 * <code>result</code>. Types preset in the user settings are taken over unchanged. All other columns start out as
 * integer columns and are widened to double and finally to string when a token can't be parsed. Per column one
 * unparsable token is accepted as missing value pattern. Columns for which no real value was seen are set to
 * string and a warning listing (up to 20 of) them is logged.
 *
 * @param userSettings the settings provided by the user; column types set in there are not touched
 * @param result the settings analyzed so far; used to create the reader and to configure the tokenizer
 * @param exec to check for cancellations and to report progress
 * @return one column property per column, holding the column type and the missing value pattern
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static ColProperty[] createColumnTypes(final FileReaderNodeSettings userSettings, final FileReaderNodeSettings result, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = result.createNewInputReader();
    long fileSize = reader.getFileSize();
    exec.setProgress("Guessing column types");
    // extract user preset type - if we got any
    DataType[] userTypes = new DataType[result.getNumberOfColumns()];
    Vector<ColProperty> userColProps = userSettings.getColumnProperties();
    if (userColProps != null) {
        for (int t = 0; t < userTypes.length; t++) {
            if (t >= userColProps.size()) {
                // the user specified fewer columns than we have
                break;
            }
            ColProperty cProp = userColProps.get(t);
            if (cProp != null) {
                DataColumnSpec cSpec = cProp.getColumnSpec();
                if (cSpec != null) {
                    userTypes[t] = cSpec.getType();
                }
            }
        }
    }
    DataType[] types = new DataType[result.getNumberOfColumns()];
    // if we find a number that can't be parsed,
    // we set it as missing value pattern
    String[] missValPattern = new String[result.getNumberOfColumns()];
    // we can use this missing value pattern only if we also got a real
    // value for that same column
    boolean[] gotValue = new boolean[result.getNumberOfColumns()];
    for (int t = 0; t < types.length; t++) {
        // set user type - if set.
        if (userTypes[t] != null) {
            types[t] = userTypes[t];
        } else {
            // start with the most restrictive type; it gets widened while reading
            types[t] = IntCell.TYPE;
        }
        // initialize the data structures:
        missValPattern[t] = null;
        gotValue[t] = false;
    }
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(result);
    int linesRead = 0;
    // index of the current token in its row; -1 before the first token of a row
    int colIdx = -1;
    // we create simple cells only, no execContext needed
    DataCellFactory cellFactory = new DataCellFactory(null);
    cellFactory.setDecimalSeparator(result.getDecimalSeparator());
    cellFactory.setThousandsSeparator(result.getThousandsSeparator());
    try {
        // close the stream on an exception
        while (true) {
            String token = tokenizer.nextToken();
            if (token == null) {
                // reached EOF
                break;
            }
            colIdx++;
            if (result.getFileHasRowHeaders() && (colIdx == 0) && (!result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted()))) {
                // ignore the row header - get the next token/column
                // (colIdx is not incremented, so the token after the row header is column 0)
                token = tokenizer.nextToken();
                if (token == null) {
                    // EOF
                    break;
                }
            }
            checkInterrupt(exec);
            if (result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // end of a row. The number of tokens seen in this row is not verified here.
                if (colIdx > 0) {
                    // only count not empty lines
                    linesRead++;
                    exec.setProgress("Verifying column types");
                }
                colIdx = -1;
                if (cutItShort(exec)) {
                    if (linesRead >= getShortCutLines(exec)) {
                        // stop early and remember that not the entire file was analyzed
                        result.setAnalyzeUsedAllRows(false);
                        break;
                    }
                    exec.setProgress(linesRead / (double) getShortCutLines(exec));
                } else {
                    if (fileSize > 0) {
                        exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
                    }
                }
                continue;
            }
            if ((linesRead < 1) && (!userSettings.isFileHasColumnHeadersUserSet() || userSettings.getFileHasColumnHeaders())) {
                // skip the tokens of the first line (it could hold column headers) -
                // unless the user explicitly told us the file has no column headers
                continue;
            }
            if (colIdx >= result.getNumberOfColumns()) {
                // Ignore the extra columns.
                continue;
            }
            if (userTypes[colIdx] != null) {
                // user preset type - nothing to do for us in this column
                continue;
            }
            cellFactory.setMissingValuePattern(missValPattern[colIdx]);
            // for numbers we trim tokens and allow empty for missValue
            token = token.trim();
            if (types[colIdx].isCompatible(IntValue.class)) {
                // a null cell is treated as "token could not be parsed as this type"
                DataCell dc = cellFactory.createDataCellOfType(IntCell.TYPE, token);
                if (dc != null) {
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                    continue;
                }
                // not an integer - could it be the missing value?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be
                    // parsed per column - but we don't use doubles
                    // as missing value! Would be odd.
                    dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                    if (dc == null) {
                        missValPattern[colIdx] = token;
                        continue;
                    }
                }
                // not an integer, not the missing value
                // - could be a double
                // (falls through to the double check below with the same token)
                types[colIdx] = DoubleCell.TYPE;
            }
            if (types[colIdx].isCompatible(DoubleValue.class)) {
                DataCell dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                if (dc != null) {
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                    continue;
                }
                // not a double - missing value maybe?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be parsed
                    // per column as missing value pattern
                    missValPattern[colIdx] = token;
                    continue;
                }
                // not a double, not a missing value,
                // lets accept everything: StringCell
                types[colIdx] = StringCell.TYPE;
                gotValue[colIdx] = true;
            }
        }
    } finally {
        tokenizer.closeSourceStream();
    }
    // set all columns we didn't see any real value for to String.
    // Discard any (possible) missing value pattern (that works,
    // because we don't accept doubles as missing value patterns).
    // Warn the user.
    String cols = "";
    // number of entries added to the warning message (capped, see below)
    int cnt = 0;
    for (int t = 0; t < types.length; t++) {
        if (userTypes[t] == null && !gotValue[t]) {
            // do it only for types not set by the user
            assert types[t].equals(IntCell.TYPE);
            types[t] = StringCell.TYPE;
            boolean gotOneVal = missValPattern[t] != null;
            missValPattern[t] = null;
            // only mention columns for which we saw no token at all and which the user doesn't skip
            if ((cnt < 21) && !gotOneVal && ((userColProps == null) || (userColProps.size() <= t) || (userColProps.get(t) == null) || (!userColProps.get(t).getSkipThisColumn()))) {
                if (cnt < 20) {
                    cols += "#" + t + ", ";
                    cnt++;
                } else if (cnt == 20) {
                    // list at most 20 columns, then add this marker once
                    cols += "...and more..., ";
                    cnt++;
                }
            }
        }
    }
    if (cols.length() > 0) {
        LOGGER.warn("Didn't get any value for column(s) with index " + // cut off the comma
        cols.substring(0, cols.length() - 2) + ". Please verify column type(s).");
    }
    // pack column types and column missing values in one object
    ColProperty[] colPropResult = new ColProperty[types.length];
    for (int c = 0; c < colPropResult.length; c++) {
        ColProperty cp = new ColProperty();
        // every spec is created with the same placeholder name; only the type is determined here
        DataColumnSpecCreator dcsc = new DataColumnSpecCreator("Foo", types[c]);
        cp.setColumnSpec(dcsc.createSpec());
        if (types[c].equals(StringCell.TYPE)) {
            // use the global one, if set, otherwise '?'
            if (result.getMissValuePatternStrCols() != null) {
                cp.setMissingValuePattern(result.getMissValuePatternStrCols());
            } else {
                cp.setMissingValuePattern("?");
            }
        } else {
            // for int or double, use the one we figured out (or none)
            cp.setMissingValuePattern(missValPattern[c]);
        }
        colPropResult[c] = cp;
    }
    return colPropResult;
}
Also used : DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) BufferedFileReader(org.knime.base.node.util.BufferedFileReader) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 3 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method testDelimiterSettingsSetColNum.

/**
 * Reads the file with the specified settings and checks whether they split all (non-empty) lines into a
 * consistent number of columns. If they do, that number is stored in the settings object. The first non-empty
 * line does not take part in the column count comparison.
 * <p>
 * With the new "ignore empty tokens at end of row" option this got a bit more complicated: We need to keep a
 * range of numberOfColumns that we can accept. The lower bound will be the number of non-empty columns we read so
 * far (because this is the minimum all rows must have), the maximum will be the non-empty plus empty columns we
 * have seen so far. The reason for that is, we may need some of these empty tokens at the end of a row to fill
 * the row, in case a later row has more (non-empty) tokens.
 * <p>
 * The source stream is closed when this method returns, normally or by throwing any exception.
 *
 * @param settings the settings (including the delimiters) to test; the number of columns is set in there if
 *            this method returns <code>true</code>
 * @param exec to check for cancellations and to report progress
 * @return <code>true</code> if the settings should be used, i.e. they produced more than one column and the
 *         column counts of the examined lines were compatible with each other
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static boolean testDelimiterSettingsSetColNum(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(settings);
    long fileSize = reader.getFileSize();
    int linesRead = 0;
    // column counter per line
    int columns = 0;
    // num of cols with these settings
    int numOfCols = -1;
    // num of cols incl. some empty tokens at EOR
    int maxNumOfCols = -1;
    // set it true to use these settings.
    boolean useSettings = false;
    // consecutive empty tokens read
    int consEmptyTokens = 0;
    // remember it, in case the last token in the file has no delimiter
    boolean lastTokenWasDelimited = false;
    try {
        while (true) {
            if ((settings.getMaximumNumberOfRowsToRead() > -1) && (linesRead >= settings.getMaximumNumberOfRowsToRead())) {
                break;
            }
            String token = tokenizer.nextToken();
            if (fileSize > 0) {
                exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
            }
            if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                columns++;
                lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
                // keep track of the empty tokens read.
                if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
                    consEmptyTokens++;
                } else {
                    consEmptyTokens = 0;
                }
            } else {
                if (columns > 0) {
                    // ignore empty lines
                    linesRead++;
                    // cutItShort also checks for interrupts
                    if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
                        settings.setAnalyzeUsedAllRows(false);
                        break;
                    }
                    if (token == null && lastTokenWasDelimited) {
                        // the file ended right after a column delimiter: EOF ends one more column
                        columns++;
                    }
                    if (linesRead > 1) {
                        if (numOfCols < 1) {
                            // no usable column count established yet - take it from this line
                            if (settings.ignoreEmptyTokensAtEndOfRow()) {
                                // these are the "hard" columns we need
                                numOfCols = columns - consEmptyTokens;
                                // we could fill up to this number with empty
                                // tokens
                                maxNumOfCols = columns;
                                if (numOfCols > 1) {
                                    // if we get more than one col settings
                                    // look reasonable
                                    useSettings = true;
                                }
                            } else {
                                numOfCols = columns;
                                if (numOfCols <= 1) {
                                    // we don't need this delimiter if we put
                                    // everything in one column
                                    useSettings = false;
                                    break;
                                }
                                useSettings = true;
                            }
                        } else {
                            // compare this line against the column count(s) seen so far
                            if (settings.ignoreEmptyTokensAtEndOfRow()) {
                                if ((columns - consEmptyTokens) > maxNumOfCols) {
                                    // we read more non-empty columns than we
                                    // could
                                    // fill (in other rows) with empty tokens
                                    useSettings = false;
                                    break;
                                }
                                if (columns < numOfCols) {
                                    // even with empty tokens this line has not
                                    // enough columns
                                    useSettings = false;
                                    break;
                                }
                                if (columns < maxNumOfCols) {
                                    // "maxNumOfCols" is the maximum number all
                                    // rows can deliver.
                                    maxNumOfCols = columns;
                                }
                                if ((columns - consEmptyTokens) > numOfCols) {
                                    // Adjust the number of "hard" columns
                                    numOfCols = columns - consEmptyTokens;
                                    if (numOfCols > 1) {
                                        useSettings = true;
                                    }
                                }
                                // the accepted range must never be empty
                                assert numOfCols <= maxNumOfCols;
                            } else {
                                // without ignoring empty tokens all lines must have exactly the same
                                // number of columns
                                if (columns != numOfCols) {
                                    // not good. Getting different number of
                                    // columns in different lines.
                                    useSettings = false;
                                    break;
                                }
                            }
                        }
                    }
                }
                // start counting for the next line
                consEmptyTokens = 0;
                columns = 0;
                lastTokenWasDelimited = false;
                if (token == null) {
                    // seen end of file.
                    break;
                }
            }
        }
    } finally {
        // close the stream on every exit path - also on I/O errors and interrupts
        tokenizer.closeSourceStream();
    }
    if (useSettings) {
        settings.setNumberOfColumns(numOfCols);
    }
    return useSettings;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 4 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method getMaximumNumberOfColumns.

/**
 * Reads the file with the specified settings and determines the largest number of columns found in a line. If
 * empty tokens at the end of a row are ignored (see settings), they don't count. The line number at which the
 * maximum was found is stored in the settings object.
 *
 * @param settings the settings used to read and tokenize the file
 * @param exec to check for cancellations and to report progress
 * @return the maximum number of columns seen in any of the examined lines
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static int getMaximumNumberOfColumns(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    final BufferedFileReader input = settings.createNewInputReader();
    final Tokenizer tok = new Tokenizer(input);
    tok.setSettings(settings);
    final double totalBytes = input.getFileSize();
    // number of lines that contained at least one token
    int nonEmptyLines = 0;
    // tokens counted in the current line
    int colsInLine = 0;
    // the result: largest column count so far
    int maxCols = 0;
    // length of the current run of empty (unquoted) tokens
    int emptyRun = 0;
    // needed at EOF to decide if the file ended right behind a column delimiter
    boolean prevTokenDelimited = false;
    try {
        for (;;) {
            final String token = tok.nextToken();
            if (!settings.isRowDelimiter(token, tok.lastTokenWasQuoted())) {
                // a regular token: one more column in this line
                colsInLine++;
                prevTokenDelimited = tok.lastTokenWasDelimited();
                final boolean emptyUnquoted = token.equals("") && !tok.lastTokenWasQuoted();
                emptyRun = emptyUnquoted ? emptyRun + 1 : 0;
                continue;
            }
            // from here on: end of a row (a null token, i.e. EOF, also ends a row)
            final boolean endOfFile = token == null;
            if (colsInLine > 0) {
                // empty lines are not counted
                nonEmptyLines++;
            }
            if (endOfFile && colsInLine < maxCols && prevTokenDelimited) {
                // if the last line has no LF, EOF delimits the last column
                colsInLine++;
            }
            if (settings.ignoreEmptyTokensAtEndOfRow()) {
                // trailing empty tokens must not raise the maximum
                colsInLine -= emptyRun;
            }
            if (colsInLine > maxCols) {
                // new maximum - remember where we found it
                maxCols = colsInLine;
                settings.setColumnNumDeterminingLineNumber(tok.getLineNumber());
            }
            colsInLine = 0;
            emptyRun = 0;
            if (endOfFile) {
                break;
            }
            if (settings.getMaximumNumberOfRowsToRead() > -1
                && tok.getLineNumber() > settings.getSkipFirstLines() + settings.getMaximumNumberOfRowsToRead()) {
                // read all the rows we are supposed to look at
                break;
            }
            // cutItShort also checks for interrupts
            if (!cutItShort(exec)) {
                if (totalBytes > 0) {
                    exec.setProgress(input.getNumberOfBytesRead() / totalBytes);
                }
                continue;
            }
            if (nonEmptyLines >= getShortCutLines(exec)) {
                settings.setAnalyzeUsedAllRows(false);
                break;
            }
            exec.setProgress(nonEmptyLines / (double) getShortCutLines(exec));
        }
    } finally {
        tok.closeSourceStream();
    }
    return maxCols;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 5 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method checkRowHeader.

/**
 * Examines the first token of every non-empty line - the first line excluded, as it could be the column header
 * line - and decides whether these tokens look like row headers: they all must consist of the same (possibly
 * empty) prefix followed by a constantly incremented number.
 *
 * @param settings the file to look at with corresponding settings
 * @param exec to check for cancellations and to report progress
 * @return true if it's reasonable to assume the file has row headers
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis should be interrupted immediately
 */
private static boolean checkRowHeader(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    final BufferedFileReader input = settings.createNewInputReader();
    final double totalBytes = input.getFileSize();
    // number of non-empty rows completely read
    long completedRows = 0;
    exec.setProgress("Guessing row IDs");
    final Tokenizer tok = new Tokenizer(input);
    tok.setSettings(settings);
    // created from the first row header candidate; all following ones are tested against it
    HeaderHelper headerPattern = null;
    boolean atRowStart = true;
    try {
        // a null token marks the end of the file
        for (String token = tok.nextToken(); token != null; token = tok.nextToken()) {
            final boolean isRowEnd = settings.isRowDelimiter(token, tok.lastTokenWasQuoted());
            if (atRowStart) {
                if (isRowEnd) {
                    // an empty row - skip it
                    continue;
                }
                atRowStart = false;
                if (completedRows == 0) {
                    // the first line is not examined (could be col header line)
                    continue;
                }
                if (headerPattern != null) {
                    // every further header must continue the pattern of the first one
                    if (!headerPattern.testNextHeader(token)) {
                        return false;
                    }
                } else {
                    // the first row ID we see
                    headerPattern = HeaderHelper.extractPrefixAndIndexFromHeader(token);
                    if (headerPattern == null) {
                        // that's not row header material
                        return false;
                    }
                }
                continue;
            }
            if (!isRowEnd) {
                // tokens other than the first one of a row are of no interest
                continue;
            }
            // row completed: the next token starts a new row
            atRowStart = true;
            completedRows++;
            if (cutItShort(exec)) {
                if (completedRows > getShortCutLines(exec)) {
                    break;
                }
                exec.setProgress(completedRows / (double) getShortCutLines(exec));
            } else if (totalBytes > 0) {
                exec.setProgress(input.getNumberOfBytesRead() / totalBytes);
            }
        }
    } finally {
        tok.closeSourceStream();
    }
    return true;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Aggregations

BufferedFileReader (org.knime.base.node.util.BufferedFileReader)8 Tokenizer (org.knime.core.util.tokenizer.Tokenizer)5 IOException (java.io.IOException)3 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)2 URL (java.net.URL)1 HashSet (java.util.HashSet)1 NoSuchElementException (java.util.NoSuchElementException)1 DataCell (org.knime.core.data.DataCell)1 DataColumnSpec (org.knime.core.data.DataColumnSpec)1 DataTableSpec (org.knime.core.data.DataTableSpec)1 DataType (org.knime.core.data.DataType)1 RowKey (org.knime.core.data.RowKey)1 DefaultRow (org.knime.core.data.def.DefaultRow)1 StringCell (org.knime.core.data.def.StringCell)1 BufferedDataContainer (org.knime.core.node.BufferedDataContainer)1 BufferedDataTable (org.knime.core.node.BufferedDataTable)1 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)1