Search in sources :

Example 1 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method addQuotes.

/**
 * Adds quotes to the settings object. It counts the occurrences of double and single quotes in each line. If the
 * count is odd, the character is not considered a quote (unless the line also contains an odd number of escaped
 * characters of this type).
 *
 * @param settings the object to add quote settings to. Must contain file location and possibly comments - but no
 *            delimiters yet!
 * @param exec to check for cancellations and to report progress
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
// Decides whether double and/or single quotes should be registered as quote patterns on `settings`,
// and whether each of them keeps the backslash as escape character.
// NOTE(review): this excerpt has lost its closing braces and several statements (the bodies of the
// end-of-file and empty-line checks, the quote-counter increments, the body of `finally`), so it does
// not compile as shown. Restore them from the original source before relying on this code.
private static void addQuotes(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    assert settings != null;
    assert settings.getAllQuotes().size() == 0;
    assert settings.getDataFileLocation() != null;
    assert settings.getAllDelimiters().size() == 0;
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    // kept as double so the progress division below is a floating point division
    double fileSize = reader.getFileSize();
    exec.setProgress("Guessing quotes");
    // add '\n' as the only delimiter, so we get one line per token
    settings.addDelimiterPattern("\n", true, false, false);
    settings.addDelimiterPattern("\r", true, false, false);
    // reconstruct original settings.
    // NOTE(review): the statement(s) this comment referred to are not visible in this excerpt.
    int linesRead = 0;
    // by default we support " and ' as quotes both with escape character \
    boolean useDoubleQuotes = true;
    boolean escapeDoubleQuotes = true;
    boolean useSingleQuotes = true;
    boolean escapeSingleQuotes = true;
    String token;
    try {
        while (true) {
            token = tokenizer.nextToken();
            if (token == null) {
                // seen end of file.
                // NOTE(review): presumably the loop is left here - statement missing in this excerpt.
            if (token.length() == 0) {
                // ignore empty lines
            // cutItShort also checks for interrupt
            // (cutItShort and getShortCutLines are defined outside this excerpt)
            if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
            // progress is reported relative to the shortcut line limit when cutting short,
            // otherwise relative to the bytes read so far
            if (cutItShort(exec)) {
                exec.setProgress(linesRead / (double) getShortCutLines(exec));
            } else if (fileSize > 0) {
                exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
            // Count the number of quote characters. If an odd number
            // appears don't support this quote character.
            // double quote count
            int dq = 0;
            // escaped double quotes
            int edq = 0;
            // single quote count
            int sq = 0;
            // escaped single quote count
            int esq = 0;
            // true while the previous character was a backslash that was not itself escaped
            boolean esc = false;
            for (int c = 0; c < token.length(); c++) {
                char ch = token.charAt(c);
                if (ch == '\\') {
                    if (esc) {
                        // it's a double backslash, leave esc mode
                        esc = false;
                    } else {
                        esc = true;
                } else {
                    if (ch == '"') {
                        // NOTE(review): the counter updates (presumably dq / edq) are missing in this excerpt.
                        if (!esc) {
                        } else {
                            // previous char was escape char.
                    if (ch == '\'') {
                        // NOTE(review): the counter updates (presumably sq / esq) are missing in this excerpt.
                        if (!esc) {
                        } else {
                    // any character other than a backslash ends escape mode
                    esc = false;
            // now figure out what to do...
            if (dq % 2 != 0) {
                // odd number of quotes
                if (edq % 2 != 0) {
                    // we can fix that by using the odd number of esc quotes
                    escapeDoubleQuotes = false;
                } else {
                    // nothing to do but not using double quotes as quotes
                    useDoubleQuotes = false;
                    if (!useSingleQuotes) {
                        // final decision made
            if (sq % 2 != 0) {
                // odd number of quotes
                if (esq % 2 != 0) {
                    // we can fix that by using the odd number of esc quotes
                    escapeSingleQuotes = false;
                } else {
                    // nothing to do but not using single quotes as quotes
                    useSingleQuotes = false;
                    if (!useDoubleQuotes) {
                        // final decision made
        // register the surviving quote characters, with '\\' as escape character only if still allowed
        if (useDoubleQuotes) {
            if (escapeDoubleQuotes) {
                settings.addQuotePattern("\"", "\"", '\\');
            } else {
                settings.addQuotePattern("\"", "\"");
        if (useSingleQuotes) {
            if (escapeSingleQuotes) {
                settings.addQuotePattern("'", "'", '\\');
            } else {
                settings.addQuotePattern("'", "'");
    } finally {
        // do this even if analysis is interrupted
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 2 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method createColumnTypes.

// Guesses a type for every column of `result` by reading the file token by token. Each column starts as
// IntCell.TYPE and is widened to DoubleCell.TYPE and then StringCell.TYPE when a token cannot be parsed;
// types preset in `userSettings` are taken over unchanged. Returns one ColProperty per column.
// NOTE(review): this excerpt has lost its closing braces and many statements (loop exits, counter
// updates, the body of `finally`, the ColProperty setters), so it does not compile as shown.
private static ColProperty[] createColumnTypes(final FileReaderNodeSettings userSettings, final FileReaderNodeSettings result, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = result.createNewInputReader();
    long fileSize = reader.getFileSize();
    exec.setProgress("Guessing column types");
    // extract user preset type - if we got any
    DataType[] userTypes = new DataType[result.getNumberOfColumns()];
    Vector<ColProperty> userColProps = userSettings.getColumnProperties();
    if (userColProps != null) {
        for (int t = 0; t < userTypes.length; t++) {
            // the user may have provided properties for fewer columns than the result has
            if (t >= userColProps.size()) {
            ColProperty cProp = userColProps.get(t);
            if (cProp != null) {
                DataColumnSpec cSpec = cProp.getColumnSpec();
                if (cSpec != null) {
                    userTypes[t] = cSpec.getType();
    DataType[] types = new DataType[result.getNumberOfColumns()];
    // if we find a number that can't be parsed,
    // we set it as missing value pattern
    String[] missValPattern = new String[result.getNumberOfColumns()];
    // we can use this missing value pattern only if we also got a real
    // value for that same column
    boolean[] gotValue = new boolean[result.getNumberOfColumns()];
    for (int t = 0; t < types.length; t++) {
        // set user type - if set.
        if (userTypes[t] != null) {
            types[t] = userTypes[t];
        } else {
            // most restrictive guess first; widened below when a token does not fit
            types[t] = IntCell.TYPE;
        // initialize the data structures:
        missValPattern[t] = null;
        gotValue[t] = false;
    Tokenizer tokenizer = new Tokenizer(reader);
    int linesRead = 0;
    // index of the column the current token belongs to; reset to -1 at every row delimiter
    int colIdx = -1;
    // we create simple cells only, no execContext needed
    DataCellFactory cellFactory = new DataCellFactory(null);
    try {
        // close the stream on an exception
        while (true) {
            String token = tokenizer.nextToken();
            if (token == null) {
                // reached EOF
            if (result.getFileHasRowHeaders() && (colIdx == 0) && (!result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted()))) {
                // ignore the row header - get the next token/column
                token = tokenizer.nextToken();
                if (token == null) {
                    // EOF
            if (result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // the file. But if not - what would we do...
                if (colIdx > 0) {
                    // only count not empty lines
                    exec.setProgress("Verifying column types");
                colIdx = -1;
                // progress is reported relative to the shortcut line limit when cutting short,
                // otherwise relative to the bytes read so far
                if (cutItShort(exec)) {
                    if (linesRead >= getShortCutLines(exec)) {
                    exec.setProgress(linesRead / (double) getShortCutLines(exec));
                } else {
                    if (fileSize > 0) {
                        exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
            // first line may be a column header line (not user-set, or user said there is one)
            if ((linesRead < 1) && (!userSettings.isFileHasColumnHeadersUserSet() || userSettings.getFileHasColumnHeaders())) {
                // unless we know it's not
            if (colIdx >= result.getNumberOfColumns()) {
                // Ignore the extra columns.
            if (userTypes[colIdx] != null) {
                // user preset type - nothing to do for us in this column
            // for numbers we trim tokens and allow empty for missValue
            token = token.trim();
            if (types[colIdx].isCompatible(IntValue.class)) {
                DataCell dc = cellFactory.createDataCellOfType(IntCell.TYPE, token);
                if (dc != null) {
                    // a non-missing cell counts as a real value for this column
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                // not an integer - could it be the missing value?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be
                    // parsed per column - but we don't use doubles
                    // as missing value! Would be odd.
                    dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                    if (dc == null) {
                        missValPattern[colIdx] = token;
                // not an integer, not the missing value
                // - could be a double
                types[colIdx] = DoubleCell.TYPE;
            if (types[colIdx].isCompatible(DoubleValue.class)) {
                DataCell dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                if (dc != null) {
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                // not a double - missing value maybe?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be parsed
                    // per column as missing value pattern
                    missValPattern[colIdx] = token;
                // not a double, not a missing value,
                // lets accept everything: StringCell
                types[colIdx] = StringCell.TYPE;
                gotValue[colIdx] = true;
    } finally {
    // set all columns we didn't see any real value for to String.
    // Discard any (possible) missing value pattern (that works,
    // because we don't accept doubles as missing value patterns).
    // Warn the user.
    String cols = "";
    // bounds the number of column indices listed in the warning (20, then an "...and more..." marker)
    // NOTE(review): the increment of cnt is not visible in this excerpt.
    int cnt = 0;
    for (int t = 0; t < types.length; t++) {
        if (userTypes[t] == null && !gotValue[t]) {
            // do it only for types not set by the user
            assert types[t].equals(IntCell.TYPE);
            types[t] = StringCell.TYPE;
            boolean gotOneVal = missValPattern[t] != null;
            missValPattern[t] = null;
            // columns the user marked as skipped are not reported
            if ((cnt < 21) && !gotOneVal && ((userColProps == null) || (userColProps.size() <= t) || (userColProps.get(t) == null) || (!userColProps.get(t).getSkipThisColumn()))) {
                if (cnt < 20) {
                    cols += "#" + t + ", ";
                } else if (cnt == 20) {
                    cols += "...and more..., ";
    if (cols.length() > 0) {
        LOGGER.warn("Didn't get any value for column(s) with index " + // cut off the comma
        cols.substring(0, cols.length() - 2) + ". Please verify column type(s).");
    // pack column types and column missing values in one object
    ColProperty[] colPropResult = new ColProperty[types.length];
    for (int c = 0; c < colPropResult.length; c++) {
        ColProperty cp = new ColProperty();
        // NOTE(review): "Foo" looks like a placeholder column name - confirm that callers replace it
        DataColumnSpecCreator dcsc = new DataColumnSpecCreator("Foo", types[c]);
        if (types[c].equals(StringCell.TYPE)) {
            // use the global one, if set, otherwise '?'
            if (result.getMissValuePatternStrCols() != null) {
            } else {
        } else {
            // for int or double, use the one we figured out (or none)
        colPropResult[c] = cp;
    return colPropResult;
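Also used : DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) BufferedFileReader(org.knime.base.node.util.BufferedFileReader) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)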

Example 3 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method testDelimiterSettingsSetColNum.

    /**
     * With the new "ignore empty tokens at end of row" option this got a bit
     * more complicated: We need to keep a range of numberOfColumns that we can
     * accept. The lower bound will be the number of non-empty columns we read
     * so far (because this is the minimum all rows must have), the maximum will
     * be the non-empty plus empty columns we have seen so far. The reason for
     * that is, we may need some of these empty tokens at the end of a row to
     * fill the row, in case a later row has more (non-empty) tokens.
     */
// Reads the file with the delimiters currently set in `settings` and checks whether the rows yield a
// consistent number of columns. Returns true if these settings should be used.
// With "ignore empty tokens at end of row" a range [numOfCols, maxNumOfCols] is tracked instead of a
// single column count.
// NOTE(review): this excerpt has lost its closing braces and several statements (loop exits, the
// updates of columns / linesRead / consEmptyTokens, the body of the final `if (useSettings)`), so it
// does not compile as shown.
private static boolean testDelimiterSettingsSetColNum(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    long fileSize = reader.getFileSize();
    int linesRead = 0;
    // column counter per line
    int columns = 0;
    // num of cols with these settings
    int numOfCols = -1;
    // num of cols incl. some empty tokens at EOR
    int maxNumOfCols = -1;
    // set it true to use these settings.
    boolean useSettings = false;
    // consecutive empty tokens read
    int consEmptyTokens = 0;
    // whether the most recent non-row-delimiter token was terminated by a delimiter
    boolean lastTokenWasDelimited = false;
    while (true) {
        // honour the user's row limit, if one is set (-1 means no limit)
        if ((settings.getMaximumNumberOfRowsToRead() > -1) && (linesRead >= settings.getMaximumNumberOfRowsToRead())) {
        String token = tokenizer.nextToken();
        if (fileSize > 0) {
            exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
        if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
            lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
            // keep track of the empty tokens read.
            // (a quoted empty string is not counted as an empty token)
            if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
            } else {
                consEmptyTokens = 0;
        } else {
            // row delimiter: evaluate the row that just ended
            if (columns > 0) {
                // ignore empty lines
                try {
                    if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
                        // cutItShort also checks for interrupts
                } catch (InterruptedExecutionException iee) {
                    // NOTE(review): rethrown unchanged; any cleanup done before the rethrow is not
                    // visible in this excerpt
                    throw iee;
                if (token == null && lastTokenWasDelimited) {
                if (linesRead > 1) {
                    if (numOfCols < 1) {
                        // for
                        // no column count established yet - take it from this row
                        if (settings.ignoreEmptyTokensAtEndOfRow()) {
                            // these are the "hard" columns we need
                            numOfCols = columns - consEmptyTokens;
                            // we could fill up to this number with empty
                            // tokens
                            maxNumOfCols = columns;
                            if (numOfCols > 1) {
                                // if we get more than one col settings
                                // look reasonable
                                useSettings = true;
                        } else {
                            numOfCols = columns;
                            if (numOfCols <= 1) {
                                // we don't need this delimiter if we put
                                // everything in one column
                                useSettings = false;
                            useSettings = true;
                    } else {
                        // compare this row against the column count seen so far
                        if (settings.ignoreEmptyTokensAtEndOfRow()) {
                            if ((columns - consEmptyTokens) > maxNumOfCols) {
                                // we read more non-empty columns than we
                                // could
                                // fill (in other rows) with empty tokens
                                useSettings = false;
                            if (columns < numOfCols) {
                                // even with empty tokens this line has not
                                // enough columns
                                useSettings = false;
                            if (columns < maxNumOfCols) {
                                // "maxNumOfCols" is the maximum number all
                                // rows can deliver.
                                maxNumOfCols = columns;
                            if ((columns - consEmptyTokens) > numOfCols) {
                                // Adjust the number of "hard" columns
                                numOfCols = columns - consEmptyTokens;
                                if (numOfCols > 1) {
                                    useSettings = true;
                            // cols
                            assert numOfCols <= maxNumOfCols;
                        } else {
                            // cols
                            if (columns != numOfCols) {
                                // not good. Getting different number of
                                // columns in different lines.
                                useSettings = false;
            // reset the per-row state for the next row
            consEmptyTokens = 0;
            columns = 0;
            lastTokenWasDelimited = false;
            if (token == null) {
                // seen end of file.
    // NOTE(review): the body of this `if` is missing in this excerpt.
    if (useSettings) {
    return useSettings;
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 4 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method getMaximumNumberOfColumns.

// Reads the file with the given settings and returns the largest number of columns found in any row.
// When "ignore empty tokens at end of row" is set, trailing empty tokens do not count towards a row's
// column count.
// NOTE(review): this excerpt has lost its closing braces and several statements (loop exits, the
// updates of colCount / dataLinesRead / consEmptyTokens, the body of `finally`), so it does not compile
// as shown.
private static int getMaximumNumberOfColumns(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    // kept as double so the progress division below is a floating point division
    double fileSize = reader.getFileSize();
    // non-empty lines
    int dataLinesRead = 0;
    // the counter per line
    int colCount = 0;
    // the maximum
    int numOfCols = 0;
    // consecutive empty tokens
    int consEmptyTokens = 0;
    // remember it, in case the last token in the file has no delimiter
    boolean lastTokenWasDelimited = false;
    try {
        while (true) {
            String token = tokenizer.nextToken();
            if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
                // keep track of the empty tokens read.
                // (a quoted empty string is not counted as an empty token)
                if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
                } else {
                    consEmptyTokens = 0;
            } else {
                // null token (=EOF) is a row delimiter
                if (colCount > 0) {
                    // ignore empty lines
                if (token == null && colCount < numOfCols && lastTokenWasDelimited) {
                    // if the last line has no LF, EOF delimits the last column
                if (settings.ignoreEmptyTokensAtEndOfRow()) {
                    // we are looking for the maximum - those empty tokens
                    // should not contribute to it.
                    colCount -= consEmptyTokens;
                if (colCount > numOfCols) {
                    // we are supposed to return the maximum
                    numOfCols = colCount;
                // reset the per-row state for the next row
                colCount = 0;
                consEmptyTokens = 0;
                if (token == null) {
                // honour the user's row limit, if one is set (-1 means no limit); skipped lines are
                // added because the tokenizer's line number is compared against the limit
                if (settings.getMaximumNumberOfRowsToRead() > -1) {
                    if (tokenizer.getLineNumber() > settings.getSkipFirstLines() + settings.getMaximumNumberOfRowsToRead()) {
                if (cutItShort(exec)) {
                    // cutItShort also checks for interrupts
                    if (dataLinesRead >= getShortCutLines(exec)) {
                    exec.setProgress(dataLinesRead / (double) getShortCutLines(exec));
                } else {
                    if (fileSize > 0) {
                        exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
    } finally {
    return numOfCols;
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 5 with BufferedFileReader

use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.

the class FileAnalyzer method checkRowHeader.

/**
 * Looks at the first token of each line (except the first line) and returns true if they are all prefixed by the
 * same (possibly empty) string followed by a constantly incremented number.
 *
 * @param settings the file to look at with corresponding settings
 * @param exec to check for cancellations and to report progress
 * @return true if it's reasonable to assume the file has row headers
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis should be interrupted immediately
 */
// Checks whether the first token of every row (the first row is ignored) looks like a row header:
// the first candidate must be accepted by HeaderHelper.extractPrefixAndIndexFromHeader and all later
// ones by HeaderHelper.testNextHeader. Returns false on the first mismatch, true otherwise.
// NOTE(review): this excerpt has lost its closing braces and several statements (loop exits, the
// update of linesRead, the body of `finally`), so it does not compile as shown.
private static boolean checkRowHeader(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    // kept as double so the progress division below is a floating point division
    final double fileSize = reader.getFileSize();
    long linesRead = 0;
    exec.setProgress("Guessing row IDs");
    Tokenizer tokenizer = new Tokenizer(reader);
    String token;
    // created from the first row header candidate; null until then
    HeaderHelper helper = null;
    boolean firstTokenInRow = true;
    try {
        while (true) {
            token = tokenizer.nextToken();
            if (token == null) {
                // end of file
            if (firstTokenInRow && settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // ignore empty rows
            if (firstTokenInRow) {
                firstTokenInRow = false;
                if (linesRead > 0) {
                    // we ignore the first line (could be col header line)
                    if (helper == null) {
                        // the first row ID we see
                        helper = HeaderHelper.extractPrefixAndIndexFromHeader(token);
                        if (helper == null) {
                            // that's not row header material
                            return false;
                    } else {
                        // all other header must match the first one
                        if (!helper.testNextHeader(token)) {
                            return false;
            } else {
                // swallow all tokens except new line delimiters
                if (settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                    // the next token is the first
                    firstTokenInRow = true;
                    // progress is reported relative to the shortcut line limit when cutting short,
                    // otherwise relative to the bytes read so far
                    if (cutItShort(exec)) {
                        if (linesRead > getShortCutLines(exec)) {
                        exec.setProgress(linesRead / (double) getShortCutLines(exec));
                    } else {
                        if (fileSize > 0) {
                            exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
    } finally {
    return true;
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)


BufferedFileReader (org.knime.base.node.util.BufferedFileReader)8 Tokenizer (org.knime.core.util.tokenizer.Tokenizer)5 IOException ( DataColumnSpecCreator ( URL ( HashSet (java.util.HashSet)1 NoSuchElementException (java.util.NoSuchElementException)1 DataCell ( DataColumnSpec ( DataTableSpec ( DataType ( RowKey ( DefaultRow ( StringCell ( BufferedDataContainer (org.knime.core.node.BufferedDataContainer)1 BufferedDataTable (org.knime.core.node.BufferedDataTable)1 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)1