
Example 1 with TokenizerSettings

use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by knime.

the class CellSplitterCellFactory method createNewColumnTypes.

/**
 * Analyzes the values in the user-selected column and tries to figure out
 * how many columns are needed to hold the split values and of which type
 * the resulting new columns have to be. <br>
 * If the "output as list" or "output as set" flag IS set in the settings
 * object, it returns one as the column number, since only one collection
 * cell is needed to store the output.
 * If the "guess" flag in the settings object is NOT set, it returns the
 * column number entered by the user and string type for all columns.
 * Otherwise it runs once through the entire table, splits the value of the
 * selected column, stores the maximum number of parts received, and tries
 * to convert each part into an int (first), then into a double; if both
 * fail, it sets string type for the corresponding column.
 *
 * @param table the table with the column to examine (can be null, if no
 *            type guessing is required)
 * @param userSettings user settings
 * @param exec the execution context to set progress and check for cancel
 *            (can be null)
 * @return a settings object containing the same settings as the ones passed
 *         in and in addition the type (and number) of each column to add
 * @throws CanceledExecutionException if user cancels
 */
static CellSplitterSettings createNewColumnTypes(final BufferedDataTable table, final CellSplitterUserSettings userSettings, final ExecutionContext exec) throws CanceledExecutionException {
    // make sure we have settings we can deal with
    DataTableSpec spec = null;
    if (table != null) {
        spec = table.getDataTableSpec();
    }
    String msg = userSettings.getStatus(spec);
    if (msg != null) {
        // don't call this with invalid settings
        assert false;
        throw new IllegalStateException(msg);
    }
    // transfer the user settings into a new settings object (the result)
    CellSplitterSettings result;
    NodeSettings tmp = new NodeSettings("tmp");
    userSettings.saveSettingsTo(tmp);
    try {
        result = new CellSplitterSettings(tmp);
    } catch (InvalidSettingsException ise) {
        // getStatus should have caught any invalid settings
        throw new IllegalStateException(ise.getMessage());
    }
    /*
     * not guessing types: output as columns
     */
    if (!userSettings.isGuessNumOfCols() && userSettings.isOutputAsCols()) {
        // we are not supposed to analyze the file.
        for (int col = 0; col < userSettings.getNumOfCols(); col++) {
            // create as many string columns as the user set
            result.addColumnOfType(StringCell.TYPE);
        }
        return result;
    }
    /*
     * not guessing types: output as list or set
     */
    if (userSettings.isOutputAsList() || userSettings.isOutputAsSet()) {
        DataType colType = null;
        // list cell type
        if (userSettings.isOutputAsList()) {
            colType = ListCell.getCollectionType(StringCell.TYPE);
        // set cell type otherwise (there is no other option left)
        } else {
            colType = SetCell.getCollectionType(StringCell.TYPE);
        }
        result.addColumnOfType(colType);
        return result;
    }
    /*
     * analyze table
     */
    int colIdx = table.getDataTableSpec().findColumnIndex(userSettings.getColumnName());
    if (colIdx < 0) {
        // the status should have checked this!
        assert false;
        throw new IllegalStateException("Input table doesn't contain selected column");
    }
    TokenizerSettings tokenizerSettings = createTokenizerSettings(userSettings);
    if (tokenizerSettings == null) {
        throw new IllegalStateException("Incorrect user settings");
    }
    long rowCnt = 0;
    long numOfRows = table.size();
    for (DataRow row : table) {
        rowCnt++;
        String inputString = "";
        DataCell inputCell = row.getCell(colIdx);
        if (inputCell.isMissing()) {
            // missing cells don't help determining the target types
            continue;
        }
        if (inputCell instanceof StringValue) {
            inputString = ((StringValue) inputCell).getStringValue();
        } else {
            inputString = inputCell.toString();
        }
        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        // the reader is no good if it doesn't support the mark operation
        assert inputReader.markSupported();
        Tokenizer tokenizer = new Tokenizer(inputReader);
        tokenizer.setSettings(tokenizerSettings);
        int addedColIdx = -1;
        // read tokens from the input, analyze the tokens and set the type
        while (true) {
            String token = tokenizer.nextToken();
            addedColIdx++;
            if (token == null) {
                // done with that input string from that row
                break;
            }
            token = token.trim();
            DataType colType = IntCell.TYPE;
            // if we already got that many columns, verify the type
            if (addedColIdx < result.getNumOfColsGuessed()) {
                colType = result.getTypeOfColumn(addedColIdx);
            } else {
                // otherwise init the type with int
                result.addColumnOfType(colType);
            }
            if (colType.equals(IntCell.TYPE)) {
                // try converting it to an integer
                try {
                    Integer.parseInt(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really an integer. Try double.
                    colType = DoubleCell.TYPE;
                }
            }
            if (colType.equals(DoubleCell.TYPE)) {
                // try converting it to a double
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really a double. Use string.
                    colType = StringCell.TYPE;
                }
            }
            // write back the type
            result.replaceTypeOfColumn(addedColIdx, colType);
        }
        if (exec != null) {
            exec.checkCanceled();
            exec.setProgress((double) rowCnt / (double) numOfRows, "Analyzing row #" + rowCnt + " of " + numOfRows);
        }
    }
    /*
     * if the input table contained missing values only, we end up with no
     * column to add. Throw an exception.
     */
    if (result.getNumOfColsGuessed() < 1) {
        throw new IllegalStateException("Data analysis computed no " + "columns to add (happens if input table is empty or " + "has only missing values).\n" + "Please set the array size manually.");
    }
    return result;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataRow(org.knime.core.data.DataRow) NodeSettings(org.knime.core.node.NodeSettings) TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) StringReader(java.io.StringReader) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) StringValue(org.knime.core.data.StringValue) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
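
The int-first, then double, then string cascade described in the Javadoc can be reproduced in isolation. The snippet below is a minimal sketch (the guessColumnType helper is hypothetical and not part of the KNIME API); it relies only on the same Integer.parseInt / Double.parseDouble checks used in the factory method above.

import org.knime.core.data.DataType;
import org.knime.core.data.def.DoubleCell;
import org.knime.core.data.def.IntCell;
import org.knime.core.data.def.StringCell;

final class TypeGuessSketch {

    /** Hypothetical helper: narrowest type that can hold the given token. */
    static DataType guessColumnType(final String token) {
        try {
            // narrowest type first: integer
            Integer.parseInt(token.trim());
            return IntCell.TYPE;
        } catch (NumberFormatException nfe) {
            // not an int - fall through to the double check
        }
        try {
            Double.parseDouble(token.trim());
            return DoubleCell.TYPE;
        } catch (NumberFormatException nfe) {
            // neither int nor double - fall back to string
            return StringCell.TYPE;
        }
    }

    public static void main(final String[] args) {
        System.out.println(guessColumnType("42"));   // IntCell.TYPE
        System.out.println(guessColumnType("3.14")); // DoubleCell.TYPE
        System.out.println(guessColumnType("abc"));  // StringCell.TYPE
    }
}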

Example 2 with TokenizerSettings

use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by knime.

the class CellSplitterCellFactory method createTokenizerSettings.

private static TokenizerSettings createTokenizerSettings(final CellSplitterUserSettings userSettings) {
    if (userSettings == null) {
        return null;
    }
    if ((userSettings.getDelimiter() == null) || (userSettings.getDelimiter().length() == 0)) {
        return null;
    }
    TokenizerSettings result = new TokenizerSettings();
    String delim = userSettings.getDelimiter();
    if (userSettings.isUseEscapeCharacter()) {
        delim = StringEscapeUtils.unescapeJava(delim);
    }
    result.addDelimiterPattern(delim,
        /* combineConsecutive */ false,
        /* returnAsSeparateToken */ false,
        /* includeInToken */ false);
    String quote = userSettings.getQuotePattern();
    if ((quote != null) && (quote.length() > 0)) {
        result.addQuotePattern(quote, quote, '\\', userSettings.isRemoveQuotes());
    }
    return result;
}
Also used : TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings)
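
A quick way to see the effect of these settings is to run a sample string through a Tokenizer configured with the same calls. This is a standalone sketch, not taken from the KNIME sources; the comma delimiter, double-quote character and backslash escape are example values, and the last flag of addQuotePattern mirrors isRemoveQuotes() in the factory above.

import java.io.StringReader;

import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;

public final class CellSplitterSettingsSketch {

    public static void main(final String[] args) {
        TokenizerSettings settings = new TokenizerSettings();
        // comma delimiter: not combined, not returned as token, not included in the token
        settings.addDelimiterPattern(",", false, false, false);
        // double quotes with a backslash escape; true mirrors isRemoveQuotes()
        settings.addQuotePattern("\"", "\"", '\\', true);

        Tokenizer tokenizer = new Tokenizer(new StringReader("a,\"b,c\",d"));
        tokenizer.setSettings(settings);

        String token;
        while ((token = tokenizer.nextToken()) != null) {
            // expected tokens: a / b,c / d (the quoted comma is not a split point)
            System.out.println(token);
        }
    }
}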

Example 3 with TokenizerSettings

use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by knime.

the class ARFFTable method getTokenizerHeaderSettings.

// createDataTableSpecFromARFFfile(URL)
/*
 * returns a settings object used to read the ARFF file header.
 */
private static TokenizerSettings getTokenizerHeaderSettings() {
    TokenizerSettings settings = new TokenizerSettings();
    // add the ARFF single line comment
    settings.addSingleLineCommentPattern("%", false, false);
    // LF is a row separator - add it as delimiter
    settings.addDelimiterPattern("\n",
        /* combine multiple= */ true,
        /* return as token= */ true,
        /* include in token= */ false);
    // ARFF knows single and double quotes
    settings.addQuotePattern("'", "'");
    settings.addQuotePattern("\"", "\"");
    // the nominal values list will be quoted into one token (but the
    // braces must stay in)
    settings.addQuotePattern("{", "}", true);
    // the attribute statement and arguments are separated by space(s)
    settings.addDelimiterPattern(" ", true, false, false);
    // or tabs
    settings.addDelimiterPattern("\t", true, false, false);
    // and a combination of them
    settings.setCombineMultipleDelimiters(true);
    return settings;
}
Also used : TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings)
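
To see what these header settings produce, the sketch below (not part of the ARFF reader itself) feeds a single @attribute line through a Tokenizer configured with the same calls. As the comments above indicate, the value list in curly braces should come back as one token with the braces kept.

import java.io.StringReader;

import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;

public final class ArffHeaderTokenizerSketch {

    public static void main(final String[] args) {
        TokenizerSettings settings = new TokenizerSettings();
        settings.addSingleLineCommentPattern("%", false, false);
        settings.addDelimiterPattern("\n", true, true, false);
        settings.addQuotePattern("'", "'");
        settings.addQuotePattern("\"", "\"");
        // braces stay in the token so the nominal value list is still recognizable
        settings.addQuotePattern("{", "}", true);
        settings.addDelimiterPattern(" ", true, false, false);
        settings.addDelimiterPattern("\t", true, false, false);
        settings.setCombineMultipleDelimiters(true);

        Tokenizer tokenizer = new Tokenizer(new StringReader("@attribute class {yes, no}\n"));
        tokenizer.setSettings(settings);

        String token;
        while ((token = tokenizer.nextToken()) != null) {
            // expected tokens: "@attribute", "class", "{yes, no}" and the LF row separator
            System.out.println("[" + token + "]");
        }
    }
}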

Example 4 with TokenizerSettings

use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by knime.

the class ARFFTable method extractNominalVals.

/*
 * Expects the list of nominal values (in curly braces and comma separated)
 * from the "@attribute" line to be next in the tokenizer (including the
 * beginning of the list with the opening brace). Will return an array of
 * StringCells with the different values extracted (and removed) from the
 * tokenizer. It will leave the EOL at the end of the list in the tokenizer.
 * The file name is passed in only for nice error messages.
 */
private static DataCell[] extractNominalVals(final String valList, final String fileName, final int lineNo) throws InvalidSettingsException {
    Collection<DataCell> vals = new LinkedHashSet<DataCell>();
    // we must support quotes and stuff - let's use another tokenizer.
    StringReader strReader = new StringReader(valList);
    Tokenizer tokizer = new Tokenizer(strReader);
    TokenizerSettings tokSets = new TokenizerSettings();
    tokSets.addDelimiterPattern(",", false, false, false);
    tokSets.addQuotePattern("'", "'");
    tokSets.addQuotePattern("\"", "\"");
    tokizer.setSettings(tokSets);
    for (String val = tokizer.nextToken(); val != null; val = tokizer.nextToken()) {
        String newval = val;
        // trim off any whitespace.
        if (!tokizer.lastTokenWasQuoted()) {
            newval = val.trim();
        }
        // make sure we don't add the same value twice.
        StringCell newValCell = new StringCell(newval);
        if (!vals.contains(newValCell)) {
            vals.add(newValCell);
        } else {
            LOGGER.warn("ARFF reader WARNING: The list of nominal " + "values in the header of file '" + fileName + "' line " + lineNo + " contains the value '" + newval + "' twice. Ignoring one appearance.");
        }
    }
    return vals.toArray(new DataCell[vals.size()]);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) StringCell(org.knime.core.data.def.StringCell) StringReader(java.io.StringReader) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
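
The duplicate handling can be demonstrated with the same tokenizer configuration on a small value list. This standalone sketch (not from the KNIME sources) adds each token to a LinkedHashSet, so the second "red" is dropped while insertion order is preserved.

import java.io.StringReader;
import java.util.Collection;
import java.util.LinkedHashSet;

import org.knime.core.data.DataCell;
import org.knime.core.data.def.StringCell;
import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;

public final class NominalValsSketch {

    public static void main(final String[] args) {
        TokenizerSettings settings = new TokenizerSettings();
        settings.addDelimiterPattern(",", false, false, false);
        settings.addQuotePattern("'", "'");
        settings.addQuotePattern("\"", "\"");

        Tokenizer tokenizer = new Tokenizer(new StringReader("red, green, red, blue"));
        tokenizer.setSettings(settings);

        Collection<DataCell> vals = new LinkedHashSet<DataCell>();
        String val;
        while ((val = tokenizer.nextToken()) != null) {
            // mirror the original: only trim tokens that were not quoted
            String newval = tokenizer.lastTokenWasQuoted() ? val : val.trim();
            // the LinkedHashSet silently ignores the duplicate "red"
            vals.add(new StringCell(newval));
        }
        // expected content: red, green, blue
        System.out.println(vals);
    }
}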

Example 5 with TokenizerSettings

use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by knime.

the class BatchExecutor method splitWorkflowVariableArg.

/**
 * Splits the argument to -workflow.variable into its sub-components (name, value, type) and returns them as an array.
 *
 * @param arg The string to split
 * @return The components of the string; no validation is done.
 * @since 2.11
 */
public static String[] splitWorkflowVariableArg(final String arg) {
    Tokenizer tokenizer = new Tokenizer(new StringReader(arg));
    TokenizerSettings settings = new TokenizerSettings();
    settings.addQuotePattern("\"", "\"", '\\');
    settings.addQuotePattern("'", "'", '\\');
    settings.addDelimiterPattern(",", /* combine multiple= */
    false, /* return as token= */
    false, /* include in token= */
    false);
    tokenizer.setSettings(settings);
    ArrayList<String> tokenList = new ArrayList<String>();
    String token;
    while ((token = tokenizer.nextToken()) != null) {
        tokenList.add(token);
    }
    return tokenList.toArray(new String[tokenList.size()]);
}
Also used : TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
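
A typical -workflow.variable argument has the form name,value,type. The sketch below shows the expected result for a value that itself contains a comma; note that the org.knime.core.node.BatchExecutor import path is an assumption here, and the comments describe the expected outcome rather than verified output.

// assumed package for BatchExecutor; adjust the import if it differs in your KNIME version
import org.knime.core.node.BatchExecutor;

public final class WorkflowVariableArgSketch {

    public static void main(final String[] args) {
        // the quoted middle component keeps its comma; backslash escapes work inside the quotes
        String[] parts = BatchExecutor.splitWorkflowVariableArg("threshold,\"0,5\",double");
        for (String part : parts) {
            // expected parts: threshold / 0,5 / double (assuming the quotes are stripped)
            System.out.println(part);
        }
    }
}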

Aggregations

TokenizerSettings (org.knime.core.util.tokenizer.TokenizerSettings) 5
StringReader (java.io.StringReader) 3
Tokenizer (org.knime.core.util.tokenizer.Tokenizer) 3
DataCell (org.knime.core.data.DataCell) 2
ArrayList (java.util.ArrayList) 1
LinkedHashSet (java.util.LinkedHashSet) 1
DataRow (org.knime.core.data.DataRow) 1
DataTableSpec (org.knime.core.data.DataTableSpec) 1
DataType (org.knime.core.data.DataType) 1
StringValue (org.knime.core.data.StringValue) 1
StringCell (org.knime.core.data.def.StringCell) 1
InvalidSettingsException (org.knime.core.node.InvalidSettingsException) 1
NodeSettings (org.knime.core.node.NodeSettings) 1