Search in sources :

Example 1 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class CellSplitterCellFactory method createNewColumnTypes.

/**
 * Analyzes the values in the user selected column and tries to figure out
 * how many columns are needed to hold the split values and of which type
 * the new resulting columns have to be. <br>
 * If the "guess" flag in the settings object is NOT set and the output goes
 * into columns, it returns the column number entered by the user and string
 * type for all columns. <br>
 * If the "output as list" or "output as set" flag IS set in the settings
 * object it returns one as column number, since only one collection cell
 * is needed to store the output. <br>
 * Otherwise it runs once through the entire table, splits the value of the
 * selected column, stores the maximum number of parts received, and tries
 * to convert each part into an int (first), then into a double, and if both
 * fail it sets string type for the corresponding column.
 *
 * @param table the table with the column to examine (can be null, if no
 *            type guessing is required)
 * @param userSettings user settings
 * @param exec the execution context to set progress and check for cancel
 *            (can be null)
 * @return a settings object containing the same settings as the ones passed
 *         in and in addition the type (and number) of each column to add
 * @throws CanceledExecutionException if user cancels
 * @throws IllegalStateException if the user settings are invalid, if type
 *             guessing is required but no table is provided, or if the
 *             analysis finds no column to add
 */
static CellSplitterSettings createNewColumnTypes(final BufferedDataTable table, final CellSplitterUserSettings userSettings, final ExecutionContext exec) throws CanceledExecutionException {
    // make sure we have settings we can deal with
    DataTableSpec spec = null;
    if (table != null) {
        spec = table.getDataTableSpec();
    }
    String msg = userSettings.getStatus(spec);
    if (msg != null) {
        // don't call this with invalid settings
        assert false;
        throw new IllegalStateException(msg);
    }
    // transfer the user settings into a new settings object (the result)
    CellSplitterSettings result;
    NodeSettings tmp = new NodeSettings("tmp");
    userSettings.saveSettingsTo(tmp);
    try {
        result = new CellSplitterSettings(tmp);
    } catch (InvalidSettingsException ise) {
        // the getStatus should have covered any invalidities; keep the
        // original exception as cause so the stack trace is not lost
        throw new IllegalStateException(ise.getMessage(), ise);
    }
    /*
     * not guessing types: output as columns
     */
    if (!userSettings.isGuessNumOfCols() && userSettings.isOutputAsCols()) {
        // we are not supposed to analyze the file.
        for (int col = 0; col < userSettings.getNumOfCols(); col++) {
            // create as many string columns as the user set
            result.addColumnOfType(StringCell.TYPE);
        }
        return result;
    }
    /*
     * not guessing types: output as list or set
     */
    if (userSettings.isOutputAsList() || userSettings.isOutputAsSet()) {
        DataType colType;
        if (userSettings.isOutputAsList()) {
            // list cell type
            colType = ListCell.getCollectionType(StringCell.TYPE);
        } else {
            // set cell type otherwise (there is no other option left)
            colType = SetCell.getCollectionType(StringCell.TYPE);
        }
        result.addColumnOfType(colType);
        return result;
    }
    /*
     * analyze table
     */
    if (table == null) {
        // the table may only be omitted if no type guessing is required;
        // fail with a clear message instead of a NullPointerException
        throw new IllegalStateException("Can't guess the number and types of the new columns without an input table");
    }
    int colIdx = table.getDataTableSpec().findColumnIndex(userSettings.getColumnName());
    if (colIdx < 0) {
        // the status should have checked this!
        assert false;
        throw new IllegalStateException("Input table doesn't contain selected column");
    }
    TokenizerSettings tokenizerSettings = createTokenizerSettings(userSettings);
    if (tokenizerSettings == null) {
        throw new IllegalStateException("Incorrect user settings");
    }
    long rowCnt = 0;
    long numOfRows = table.size();
    for (DataRow row : table) {
        rowCnt++;
        DataCell inputCell = row.getCell(colIdx);
        if (inputCell.isMissing()) {
            // missing cells don't help determining the target types
            continue;
        }
        String inputString;
        if (inputCell instanceof StringValue) {
            inputString = ((StringValue) inputCell).getStringValue();
        } else {
            inputString = inputCell.toString();
        }
        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        // the reader is no good if it doesn't support the mark operation
        assert inputReader.markSupported();
        Tokenizer tokenizer = new Tokenizer(inputReader);
        tokenizer.setSettings(tokenizerSettings);
        int addedColIdx = -1;
        // read tokens from the input, analyze the tokens and set the type
        while (true) {
            String token = tokenizer.nextToken();
            addedColIdx++;
            if (token == null) {
                // done with that input string from that row
                break;
            }
            token = token.trim();
            DataType colType = IntCell.TYPE;
            if (addedColIdx < result.getNumOfColsGuessed()) {
                // we already got that many columns: verify the type that
                // earlier rows settled on (types only ever get wider)
                colType = result.getTypeOfColumn(addedColIdx);
            } else {
                // otherwise init the type with int
                result.addColumnOfType(colType);
            }
            if (colType.equals(IntCell.TYPE)) {
                // try converting it to an integer
                try {
                    Integer.parseInt(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really an integer. Try double.
                    colType = DoubleCell.TYPE;
                }
            }
            if (colType.equals(DoubleCell.TYPE)) {
                // try converting it to a double
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really a double. Use string.
                    colType = StringCell.TYPE;
                }
            }
            // write back the type
            result.replaceTypeOfColumn(addedColIdx, colType);
        }
        if (exec != null) {
            exec.checkCanceled();
            exec.setProgress((double) rowCnt / (double) numOfRows, "Analyzing row #" + rowCnt + " of " + numOfRows);
        }
    }
    /*
     * if the input table contained missing values only, we end up with no
     * column to add. Throw an exception.
     */
    if (result.getNumOfColsGuessed() < 1) {
        throw new IllegalStateException("Data analysis computed no " + "columns to add (happens if input table is empty or " + "has only missing values).\n" + "Please set the array size manually.");
    }
    return result;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataRow(org.knime.core.data.DataRow) NodeSettings(org.knime.core.node.NodeSettings) TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) StringReader(java.io.StringReader) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) StringValue(org.knime.core.data.StringValue) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 2 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class CellSplitterCellFactory method tokenizeAndCreateCollectionsCell.

/**
 * Tokenizes the string representation of the given data cell and returns
 * an array of data cells. The array contains only one data cell, which
 * is a collection cell. Whether it is a List or Set cell is specified in
 * the settings. The collection cell contains string cells. For each
 * token one string cell is created. <br>
 * A missing input cell yields a missing cell, unless the settings ask for
 * empty strings instead; in that case a collection holding one empty
 * string cell is returned, again as List or Set according to the settings.
 * @param inputCell the cell to tokenize (its string representation)
 * @return An array containing exactly one cell: the collection cell storing
 * one string cell per token, or a missing cell for missing input.
 * @since 2.6
 */
private DataCell[] tokenizeAndCreateCollectionsCell(final DataCell inputCell) {
    final Collection<DataCell> strColl;
    if (inputCell.isMissing()) {
        // missing value handling
        if (!m_settings.isUseEmptyString()) {
            return new DataCell[]{DataType.getMissingCell()};
        }
        strColl = new ArrayList<DataCell>(1);
        strColl.add(EMPTY_STRINGCELL);
    } else {
        final String inputString = getInputString(inputCell);
        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        Tokenizer tokenizer = prepareTokenizer(inputReader);
        strColl = new ArrayList<DataCell>();
        String token = null;
        while ((token = tokenizer.nextToken()) != null) {
            if (m_settings.isTrim()) {
                token = token.trim();
            }
            strColl.add(new StringCell(token));
        }
    }
    // single decision point for the collection kind, so that the empty
    // string replacement of a missing input also honors the list/set setting
    DataCell[] result = new DataCell[1];
    if (m_settings.isOutputAsList()) {
        result[0] = CollectionCellFactory.createListCell(strColl);
    } else {
        result[0] = CollectionCellFactory.createSetCell(strColl);
    }
    return result;
}
Also used : StringCell(org.knime.core.data.def.StringCell) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 3 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method addQuotes.

/**
 * Adds quotes to the settings object. It counts the occurrence of double and single quotes in each line. If it's an
 * odd number it will not consider this being a quote (unless it has an odd number of escaped characters of this
 * type, in which case the quote is kept but without escape character support).
 *
 * @param settings the object to add quote settings to. Must contain file location and possibly comments - but no
 *            delimiters yet!
 * @param exec to check for cancellations and to report progress
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static void addQuotes(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    assert settings != null;
    assert settings.getAllQuotes().size() == 0;
    assert settings.getDataFileLocation() != null;
    assert settings.getAllDelimiters().size() == 0;
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    // by default we support " and ' as quotes both with escape character \
    boolean useDoubleQuotes = true;
    boolean escapeDoubleQuotes = true;
    boolean useSingleQuotes = true;
    boolean escapeSingleQuotes = true;
    // everything that touches the opened stream happens inside the try, so
    // the stream is closed no matter where the analysis fails
    try {
        double fileSize = reader.getFileSize();
        exec.setProgress("Guessing quotes");
        try {
            // add '\n' as the only delimiter, so we get one line per token
            settings.addDelimiterPattern("\n", true, false, false);
            settings.addDelimiterPattern("\r", true, false, false);
            tokenizer.setSettings(settings);
        } finally {
            // reconstruct original settings (also if the setup failed).
            settings.removeAllDelimiters();
        }
        int linesRead = 0;
        while (true) {
            String token = tokenizer.nextToken();
            if (token == null) {
                // seen end of file.
                break;
            }
            if (token.length() == 0) {
                // ignore empty lines
                continue;
            }
            linesRead++;
            // cutItShort also checks for interrupt
            if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
                settings.setAnalyzeUsedAllRows(false);
                break;
            }
            if (cutItShort(exec)) {
                exec.setProgress(linesRead / (double) getShortCutLines(exec));
            } else if (fileSize > 0) {
                exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
            }
            // Count the number of quote characters. If an odd number
            // appears don't support this quote character.
            // double quote count
            int dq = 0;
            // escaped double quotes
            int edq = 0;
            // single quote count
            int sq = 0;
            // escaped single quote count
            int esq = 0;
            // true if the previous character was an unescaped backslash
            boolean esc = false;
            for (int c = 0; c < token.length(); c++) {
                char ch = token.charAt(c);
                if (ch == '\\') {
                    // a double backslash leaves esc mode, a single one enters it
                    esc = !esc;
                    continue;
                }
                if (ch == '"') {
                    if (esc) {
                        // previous char was escape char.
                        edq++;
                    } else {
                        dq++;
                    }
                } else if (ch == '\'') {
                    if (esc) {
                        esq++;
                    } else {
                        sq++;
                    }
                }
                esc = false;
            }
            // now figure out what to do...
            if (dq % 2 != 0) {
                // odd number of quotes
                if (edq % 2 != 0) {
                    // we can fix that by using the odd number of esc quotes
                    escapeDoubleQuotes = false;
                } else {
                    // nothing to do but not using double quotes as quotes
                    useDoubleQuotes = false;
                    if (!useSingleQuotes) {
                        // final decision made
                        break;
                    }
                }
            }
            if (sq % 2 != 0) {
                // odd number of quotes
                if (esq % 2 != 0) {
                    // we can fix that by using the odd number of esc quotes
                    escapeSingleQuotes = false;
                } else {
                    // nothing to do but not using single quotes as quotes
                    useSingleQuotes = false;
                    if (!useDoubleQuotes) {
                        // final decision made
                        break;
                    }
                }
            }
        }
        if (useDoubleQuotes) {
            if (escapeDoubleQuotes) {
                settings.addQuotePattern("\"", "\"", '\\');
            } else {
                settings.addQuotePattern("\"", "\"");
            }
        }
        if (useSingleQuotes) {
            if (escapeSingleQuotes) {
                settings.addQuotePattern("'", "'", '\\');
            } else {
                settings.addQuotePattern("'", "'");
            }
        }
    } finally {
        // do this even if analysis is interrupted
        tokenizer.closeSourceStream();
    }
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 4 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method createColumnProperties.

/**
 * Determines the type and name of each column. It tries to figure out if there are column headers in the file or
 * otherwise generates names for the columns. <br>
 * We read from the first line one token per column (plus one for the row header if we have row headers in the
 * file). Then we do three checks: first, if we have row headers and are missing one token we assume the column
 * header for the "row-header-column" is missing, thus we must have column headers. Second, we check the types of
 * the tokens read. If one of the tokens (except the first if we have row headers) cannot be converted into the
 * column's type, we assume it's a column header. Last, if all tokens (except the first if we have row headers) start
 * with the same prefix followed by an increasing number, then that looks like column headers to us. Otherwise we
 * say we have no column headers.
 *
 * @param userSettings settings user provided. Must be honored!
 * @param result the settings so far, must contain data url, delimiters, comments, quotes, colNumber, and rowHeader
 *            flag
 * @param exec to check for cancellations and report progress to
 * @return a vector of colProperty objects, having the columnSpec set and the useFileHeader flag
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis was interrupted
 */
private static Vector<ColProperty> createColumnProperties(final FileReaderNodeSettings userSettings, final FileReaderNodeSettings result, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    // first detect the type of each column
    ExecutionMonitor subExec = exec.createSubProgress(TYPES_SUB);
    ColProperty[] colProps = createColumnTypes(userSettings, result, subExec);
    // extract the column types and column missing values from the result
    // of the above method call
    DataType[] columnTypes = new DataType[colProps.length];
    String[] missValues = new String[colProps.length];
    String[] formatParameters = new String[colProps.length];
    for (int c = 0; c < colProps.length; c++) {
        columnTypes[c] = colProps[c].getColumnSpec().getType();
        missValues[c] = colProps[c].getMissingValuePattern();
        formatParameters[c] = colProps[c].getFormatParameter().orElse(null);
    }
    subExec.setProgress(1.0);
    checkInterrupt(exec);
    // number of columns must be set accordingly (including skipped cols)
    assert result.getNumberOfColumns() == columnTypes.length;
    // store the first line here to analyze the tokens - depending on the
    // row header flag expect one more token to come.
    String rowHeader = null;
    String scndLineRowHeader = null;
    String[] columnHeaders = new String[result.getNumberOfColumns()];
    BufferedReader reader = result.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    // read the first line inside try/finally so the source stream is closed
    // on every exit path, including I/O errors and interruption
    try {
        tokenizer.setSettings(result);
        exec.setProgress("Guessing column headers");
        // the first token is supposed to be the header for the "row column"
        if (result.getFileHasRowHeaders()) {
            rowHeader = tokenizer.nextToken();
        }
        // now read the (possible) data column headers
        for (int c = 0; c < columnHeaders.length; c++) {
            String token = tokenizer.nextToken();
            if (token == null) {
                // end of file... already?!?
                break;
            }
            if (result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // end of line - a bit early, huh??
                // NOTE(review): this value is overwritten by the read after
                // the loop, so in this case the token after it is used -
                // confirm whether that is intended
                scndLineRowHeader = tokenizer.nextToken();
                break;
            }
            columnHeaders[c] = token;
            checkInterrupt(exec);
        }
        // the next token is the row header in the next row (could be...)
        scndLineRowHeader = tokenizer.nextToken();
    } finally {
        tokenizer.closeSourceStream();
    }
    Vector<ColProperty> userColProps = userSettings.getColumnProperties();
    if (userColProps == null) {
        // that saves us quite some checking later
        userColProps = new Vector<ColProperty>();
    }
    if (!userSettings.isFileHasColumnHeadersUserSet()) {
        // the user didn't tell us whether the file has column headers.
        // If the file has row headers and the first line is one token short
        // (the last header slot stayed empty), we assume the rowHeader is a
        // data column header.
        if (result.getFileHasRowHeaders() && (columnHeaders.length > 0) && (columnHeaders[columnHeaders.length - 1] == null)) {
            result.setFileHasColumnHeaders(true);
            // discard the last (=null) token
            String[] colNames = new String[result.getNumberOfColumns()];
            colNames[0] = rowHeader;
            System.arraycopy(columnHeaders, 0, colNames, 1, colNames.length - 1);
            return createColProps(colNames, userColProps, columnTypes, missValues, formatParameters, exec);
        }
        // another indication for a column_headers_must_have is when the
        // first line contains tokens that are not type compliant with all
        // other lines (e.g. all items in the column are integers except in
        // the first line).
        // we create simple cells only
        DataCellFactory cellFactory = new DataCellFactory(null);
        cellFactory.setDecimalSeparator(result.getDecimalSeparator());
        cellFactory.setThousandsSeparator(result.getThousandsSeparator());
        for (int c = 0; c < columnHeaders.length; c++) {
            checkInterrupt(exec);
            if (columnHeaders[c] == null) {
                // the first line ended early - could be anything...
                continue;
            }
            cellFactory.setMissingValuePattern(missValues[c]);
            cellFactory.setFormatParameter(formatParameters[c]);
            DataCell dc = cellFactory.createDataCellOfType(columnTypes[c], columnHeaders[c]);
            if (dc != null) {
                // this column header could be data - try the others...
                continue;
            }
            // header is not data: must be column header
            result.setFileHasColumnHeaders(true);
            return createColProps(columnHeaders, userColProps, columnTypes, missValues, formatParameters, exec);
        }
        // if the row headers were detected by us (not set by the user), the
        // row header of the first line should follow the same prefix+index
        // pattern as the one of the second line
        if (!result.isFileHasRowHeadersUserSet()) {
            // without both row headers we have nothing to test against.
            if (rowHeader != null && scndLineRowHeader != null) {
                HeaderHelper hh = HeaderHelper.extractPrefixAndIndexFromHeader(rowHeader);
                if (hh == null || !hh.testNextHeader(scndLineRowHeader)) {
                    // this first line row header isn't a good row header
                    // all the other lines have nice ones - create col hdrs
                    // also create colHdrs if they don't fit to each other
                    // header is not data: must be column header
                    result.setFileHasColumnHeaders(true);
                    return createColProps(columnHeaders, userColProps, columnTypes, missValues, formatParameters, exec);
                }
            }
        }
        // last check: it looks like column headers if the tokens
        // all have the same prefix and a growing index.
        if ((columnHeaders.length > 0) && consecutiveHeaders(columnHeaders, exec)) {
            result.setFileHasColumnHeaders(true);
            return createColProps(columnHeaders, userColProps, columnTypes, missValues, formatParameters, exec);
        }
        // otherwise we assume the first line doesn't contain headers.
        // pass an array with null strings and it will create headers for us
        result.setFileHasColumnHeaders(false);
        // null array
        String[] nulls = new String[columnHeaders.length];
        return createColProps(nulls, userColProps, columnTypes, missValues, formatParameters, exec);
    } else {
        // user set fileHasColHeaders - see if it's true or false
        result.setFileHasColumnHeaders(userSettings.getFileHasColumnHeaders());
        result.setFileHasColumnHeadersUserSet(true);
        if (userSettings.getFileHasColumnHeaders()) {
            // use the headers we read in
            if ((columnHeaders.length > 0) && (columnHeaders[columnHeaders.length - 1] == null) && rowHeader != null) {
                // okay, we got one too few, use row header
                String[] colNames = new String[result.getNumberOfColumns()];
                colNames[0] = rowHeader;
                System.arraycopy(columnHeaders, 0, colNames, 1, colNames.length - 1);
                return createColProps(colNames, userColProps, columnTypes, missValues, formatParameters, exec);
            } else {
                return createColProps(columnHeaders, userColProps, columnTypes, missValues, formatParameters, exec);
            }
        } else {
            // don't read col headers - create null array to generate names
            String[] colNames = new String[columnHeaders.length];
            return createColProps(colNames, userColProps, columnTypes, missValues, formatParameters, exec);
        }
    }
}
Also used : BufferedReader(java.io.BufferedReader) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) ExecutionMonitor(org.knime.core.node.ExecutionMonitor) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 5 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method createColumnTypes.

/**
 * Guesses the type of each column by reading the file token by token. Every column without a user preset type
 * starts out as int, is widened to double when a token can't be read as int, and to string when a token can't
 * be read as double either. Per column, one token that can't be parsed is accepted as missing value pattern
 * before the type is widened (a token that reads as double is never taken as pattern of an int column).
 * Columns for which no real value was seen are set to string, and a warning is logged for them.
 *
 * @param userSettings settings the user provided; column types preset in there are taken over unchanged
 * @param result the settings so far; provides the input reader, the number of columns, the row header flag,
 *            the row delimiters and the number separators
 * @param exec to report progress to and to check for interrupts
 * @return one ColProperty per column, holding a column spec with the determined type (the column name is just
 *         the placeholder "Foo") and the missing value pattern
 * @throws IOException declared for the file access (reader creation and tokenizing)
 * @throws InterruptedExecutionException presumably thrown by checkInterrupt/cutItShort if the analysis was
 *             interrupted - confirm against those helpers
 */
private static ColProperty[] createColumnTypes(final FileReaderNodeSettings userSettings, final FileReaderNodeSettings result, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = result.createNewInputReader();
    long fileSize = reader.getFileSize();
    exec.setProgress("Guessing column types");
    // extract user preset type - if we got any
    DataType[] userTypes = new DataType[result.getNumberOfColumns()];
    Vector<ColProperty> userColProps = userSettings.getColumnProperties();
    if (userColProps != null) {
        for (int t = 0; t < userTypes.length; t++) {
            if (t >= userColProps.size()) {
                // user provided properties for fewer columns than we have
                break;
            }
            ColProperty cProp = userColProps.get(t);
            if (cProp != null) {
                DataColumnSpec cSpec = cProp.getColumnSpec();
                if (cSpec != null) {
                    userTypes[t] = cSpec.getType();
                }
            }
        }
    }
    DataType[] types = new DataType[result.getNumberOfColumns()];
    // if we find a number that can't be parsed,
    // we set it as missing value pattern
    String[] missValPattern = new String[result.getNumberOfColumns()];
    // we can use this missing value pattern only if we also got a real
    // value for that same column
    boolean[] gotValue = new boolean[result.getNumberOfColumns()];
    for (int t = 0; t < types.length; t++) {
        // set user type - if set. Otherwise start with the narrowest type.
        if (userTypes[t] != null) {
            types[t] = userTypes[t];
        } else {
            types[t] = IntCell.TYPE;
        }
        // initialize the data structures:
        missValPattern[t] = null;
        gotValue[t] = false;
    }
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(result);
    // number of non-empty lines completed so far
    int linesRead = 0;
    // index of the current token's column in the current line; -1 at line start
    int colIdx = -1;
    // we create simple cells only, no execContext needed
    DataCellFactory cellFactory = new DataCellFactory(null);
    cellFactory.setDecimalSeparator(result.getDecimalSeparator());
    cellFactory.setThousandsSeparator(result.getThousandsSeparator());
    try {
        // close the stream on an exception
        while (true) {
            String token = tokenizer.nextToken();
            if (token == null) {
                // reached EOF
                break;
            }
            colIdx++;
            if (result.getFileHasRowHeaders() && (colIdx == 0) && (!result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted()))) {
                // ignore the row header - get the next token/column
                token = tokenizer.nextToken();
                if (token == null) {
                    // EOF
                    break;
                }
            }
            checkInterrupt(exec);
            if (result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // end of the line: count it, reset the column index and
                // update the progress
                if (colIdx > 0) {
                    // only count not empty lines
                    linesRead++;
                    exec.setProgress("Verifying column types");
                }
                colIdx = -1;
                if (cutItShort(exec)) {
                    if (linesRead >= getShortCutLines(exec)) {
                        // stop early; flag that not all rows were looked at
                        result.setAnalyzeUsedAllRows(false);
                        break;
                    }
                    exec.setProgress(linesRead / (double) getShortCutLines(exec));
                } else {
                    if (fileSize > 0) {
                        exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
                    }
                }
                continue;
            }
            if ((linesRead < 1) && (!userSettings.isFileHasColumnHeadersUserSet() || userSettings.getFileHasColumnHeaders())) {
                // skip the tokens of the first line: it could contain column
                // headers - unless the user told us it doesn't
                continue;
            }
            if (colIdx >= result.getNumberOfColumns()) {
                // Ignore the extra columns.
                continue;
            }
            if (userTypes[colIdx] != null) {
                // user preset type - nothing to do for us in this column
                continue;
            }
            cellFactory.setMissingValuePattern(missValPattern[colIdx]);
            // for numbers we trim tokens and allow empty for missValue
            token = token.trim();
            if (types[colIdx].isCompatible(IntValue.class)) {
                // a null cell is treated as "token can't be read as this type"
                DataCell dc = cellFactory.createDataCellOfType(IntCell.TYPE, token);
                if (dc != null) {
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                    continue;
                }
                // not an integer - could it be the missing value?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be
                    // parsed per column - but we don't use doubles
                    // as missing value! Would be odd.
                    dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                    if (dc == null) {
                        missValPattern[colIdx] = token;
                        continue;
                    }
                }
                // not an integer, not the missing value
                // - could be a double
                types[colIdx] = DoubleCell.TYPE;
            }
            // no else: a column just widened from int falls through and the
            // same token is tested as double right away
            if (types[colIdx].isCompatible(DoubleValue.class)) {
                DataCell dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
                if (dc != null) {
                    gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
                    continue;
                }
                // not a double - missing value maybe?
                if (missValPattern[colIdx] == null) {
                    // we accept one token that can't be parsed
                    // per column as missing value pattern
                    missValPattern[colIdx] = token;
                    continue;
                }
                // not a double, not a missing value,
                // lets accept everything: StringCell
                types[colIdx] = StringCell.TYPE;
                gotValue[colIdx] = true;
            }
        }
    } finally {
        tokenizer.closeSourceStream();
    }
    // set all columns we didn't see any real value for to String.
    // Discard any (possible) missing value pattern (that works,
    // because we don't accept doubles as missing value patterns).
    // Warn the user.
    String cols = "";
    // number of entries added to the warning (at most 20 indices plus one
    // "...and more..." marker)
    int cnt = 0;
    for (int t = 0; t < types.length; t++) {
        if (userTypes[t] == null && !gotValue[t]) {
            // do it only for types not set by the user
            assert types[t].equals(IntCell.TYPE);
            types[t] = StringCell.TYPE;
            boolean gotOneVal = missValPattern[t] != null;
            missValPattern[t] = null;
            // mention the column in the warning only if we saw no token at
            // all for it and the user didn't mark it as skipped
            if ((cnt < 21) && !gotOneVal && ((userColProps == null) || (userColProps.size() <= t) || (userColProps.get(t) == null) || (!userColProps.get(t).getSkipThisColumn()))) {
                if (cnt < 20) {
                    cols += "#" + t + ", ";
                    cnt++;
                } else if (cnt == 20) {
                    cols += "...and more..., ";
                    cnt++;
                }
            }
        }
    }
    if (cols.length() > 0) {
        LOGGER.warn("Didn't get any value for column(s) with index " + // cut off the comma
        cols.substring(0, cols.length() - 2) + ". Please verify column type(s).");
    }
    // pack column types and column missing values in one object
    ColProperty[] colPropResult = new ColProperty[types.length];
    for (int c = 0; c < colPropResult.length; c++) {
        ColProperty cp = new ColProperty();
        // "Foo" is only a placeholder name for the spec created here
        DataColumnSpecCreator dcsc = new DataColumnSpecCreator("Foo", types[c]);
        cp.setColumnSpec(dcsc.createSpec());
        if (types[c].equals(StringCell.TYPE)) {
            // use the global one, if set, otherwise '?'
            if (result.getMissValuePatternStrCols() != null) {
                cp.setMissingValuePattern(result.getMissValuePatternStrCols());
            } else {
                cp.setMissingValuePattern("?");
            }
        } else {
            // for int or double, use the one we figured out (or none)
            cp.setMissingValuePattern(missValPattern[c]);
        }
        colPropResult[c] = cp;
    }
    return colPropResult;
}
Also used : DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) BufferedFileReader(org.knime.base.node.util.BufferedFileReader) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Aggregations

Tokenizer (org.knime.core.util.tokenizer.Tokenizer): 13, DataCell (org.knime.core.data.DataCell): 7, StringReader (java.io.StringReader): 5, BufferedFileReader (org.knime.base.node.util.BufferedFileReader): 5, DataType (org.knime.core.data.DataType): 4, TokenizerSettings (org.knime.core.util.tokenizer.TokenizerSettings): 3, BufferedReader (java.io.BufferedReader): 2, ArrayList (java.util.ArrayList): 2, LinkedHashSet (java.util.LinkedHashSet): 2, DataColumnSpec (org.knime.core.data.DataColumnSpec): 2, DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator): 2, DataTableSpec (org.knime.core.data.DataTableSpec): 2, StringCell (org.knime.core.data.def.StringCell): 2, InvalidSettingsException (org.knime.core.node.InvalidSettingsException): 2, IOException (java.io.IOException): 1, InputStream (java.io.InputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, HashSet (java.util.HashSet): 1, Vector (java.util.Vector): 1, DataColumnDomainCreator (org.knime.core.data.DataColumnDomainCreator): 1