Example 6 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class ARFFTable method extractNominalVals.

/*
     * Expects the list of nominal values (in curly braces and comma separated)
     * from the "@attribute" line to be next in the tokenizer (including the
     * opening brace of the list). Returns an array of StringCells with the
     * distinct values extracted (and removed) from the tokenizer. It leaves
     * the EOL at the end of the list in the tokenizer. The file name and line
     * number are passed in for nice error messages.
     */
private static DataCell[] extractNominalVals(final String valList, final String fileName, final int lineNo) throws InvalidSettingsException {
    Collection<DataCell> vals = new LinkedHashSet<DataCell>();
    // the values may be quoted, so we use another tokenizer.
    StringReader strReader = new StringReader(valList);
    Tokenizer tokenizer = new Tokenizer(strReader);
    TokenizerSettings tokSets = new TokenizerSettings();
    tokSets.addDelimiterPattern(",", false, false, false);
    tokSets.addQuotePattern("'", "'");
    tokSets.addQuotePattern("\"", "\"");
    tokenizer.setSettings(tokSets);
    for (String val = tokenizer.nextToken(); val != null; val = tokenizer.nextToken()) {
        String newval = val;
        // trim off any whitespace.
        if (!tokenizer.lastTokenWasQuoted()) {
            newval = val.trim();
        }
        }
        // make sure we don't add the same value twice.
        StringCell newValCell = new StringCell(newval);
        if (!vals.contains(newValCell)) {
            vals.add(newValCell);
        } else {
            LOGGER.warn("ARFF reader WARNING: The list of nominal " + "values in the header of file '" + fileName + "' line " + lineNo + " contains the value '" + newval + "' twice. Ignoring one appearance.");
        }
    }
    return vals.toArray(new DataCell[vals.size()]);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) StringCell(org.knime.core.data.def.StringCell) StringReader(java.io.StringReader) DataCell(org.knime.core.data.DataCell) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
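
For reference, a minimal standalone sketch of the same Tokenizer/TokenizerSettings combination used in extractNominalVals above. The input string and printed output are invented for illustration; only knime-core calls already shown in the snippet are used.

import java.io.StringReader;

import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;

public class NominalValsSketch {
    public static void main(final String[] args) {
        TokenizerSettings settings = new TokenizerSettings();
        // same configuration as in extractNominalVals above
        settings.addDelimiterPattern(",", false, false, false);
        settings.addQuotePattern("'", "'");
        settings.addQuotePattern("\"", "\"");

        Tokenizer tokenizer = new Tokenizer(new StringReader("'red', green , \"dark blue\""));
        tokenizer.setSettings(settings);
        for (String val = tokenizer.nextToken(); val != null; val = tokenizer.nextToken()) {
            // unquoted tokens may carry surrounding whitespace
            String value = tokenizer.lastTokenWasQuoted() ? val : val.trim();
            System.out.println("value: >" + value + "<");
        }
        tokenizer.closeSourceStream();
    }
}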

Example 7 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class ARFFTable method createDataTableSpecFromARFFfile.

/**
 * Reads in the header of the specified ARFF file and returns a
 * corresponding table spec object.
 *
 * @param fileLoc the location of the ARFF file to read
 * @param exec to enable users to cancel this process
 * @return a table spec reflecting the settings in the file header
 * @throws IOException if the file location couldn't be opened
 * @throws InvalidSettingsException if the file contains an invalid format
 * @throws CanceledExecutionException if user canceled
 */
public static DataTableSpec createDataTableSpecFromARFFfile(final URL fileLoc, final ExecutionMonitor exec) throws IOException, InvalidSettingsException, CanceledExecutionException {
    // create a tokenizer to read the header
    InputStream inStream = FileUtil.openStreamWithTimeout(fileLoc);
    Tokenizer tokenizer = new Tokenizer(new BufferedReader(new InputStreamReader(inStream)));
    // create tokenizer settings that will deliver us the attributes and
    // arguments as tokens.
    tokenizer.setSettings(getTokenizerHeaderSettings());
    // prepare for creating a column spec for each "@attribute" read
    Vector<DataColumnSpec> colSpecs = new Vector<DataColumnSpec>();
    String tableName = null;
    String token;
    // process the header tokens until the data section begins.
    while (true) {
        if (exec != null) {
            // throws exception if user canceled.
            exec.checkCanceled();
        }
        DataCell[] possVals = null;
        DataType type;
        token = tokenizer.nextToken();
        if (token == null) {
            throw new InvalidSettingsException("Incorrect/Incomplete " + "ARFF file. No data section found.");
        }
        if (token.length() == 0) {
            // ignore empty lines
            continue;
        }
        if (token.equalsIgnoreCase("@DATA")) {
            // this starts the data section: we are done.
            break;
        }
        if (token.equalsIgnoreCase("@ATTRIBUTE")) {
            // defines a new data column
            String colName = tokenizer.nextToken();
            String colType = null;
            if (tokenizer.lastTokenWasQuoted() && tokenizer.getLastQuoteBeginPattern().equals("{")) {
                // the nominal value list got attached to the column
                // name. Extract it from there and set it in the 'colType'.
                if (colName.charAt(0) == '{') {
                    // seems we only got a value list.
                    // The col name must be empty/missing then...
                    colType = colName;
                    colName = null;
                } else {
                    int openBraceIdx = colName.indexOf('{');
                    int closeBraceIdx = colName.lastIndexOf('}');
                    colType = colName.substring(openBraceIdx + 1, closeBraceIdx);
                    colName = colName.substring(0, openBraceIdx);
                // we ignore everything after the nominal value list
                }
            } else {
                colType = tokenizer.nextToken();
            }
            if ((colName == null) || (colType == null)) {
                throw new InvalidSettingsException("Incomplete '@attribute' statement at line " + tokenizer.getLineNumber() + " in ARFF file '" + fileLoc + "'.");
            }
            // now dispatch on the column type.
            if (colType.equalsIgnoreCase("NUMERIC") || colType.equalsIgnoreCase("REAL")) {
                type = DoubleCell.TYPE;
                // ignore whatever still comes in that line, warn though
                readUntilEOL(tokenizer, fileLoc.toString());
            } else if (colType.equalsIgnoreCase("INTEGER")) {
                type = IntCell.TYPE;
                // ignore whatever still comes in that line, warn though
                readUntilEOL(tokenizer, fileLoc.toString());
            } else if (colType.equalsIgnoreCase("STRING")) {
                type = StringCell.TYPE;
                // ignore whatever still comes in that line, warn though
                readUntilEOL(tokenizer, fileLoc.toString());
            } else if (colType.equalsIgnoreCase("DATE")) {
                // we use a string cell for dates ...
                type = StringCell.TYPE;
                // ignore whatever date format is specified
                readUntilEOL(tokenizer, null);
            } else if (tokenizer.lastTokenWasQuoted() && tokenizer.getLastQuoteBeginPattern().equals("{")) {
                // the braces should still be in the string
                int openBraceIdx = colType.indexOf('{');
                int closeBraceIdx = colType.lastIndexOf('}');
                if ((openBraceIdx >= 0) && (closeBraceIdx > 0) && (openBraceIdx < closeBraceIdx)) {
                    colType = colType.substring(openBraceIdx + 1, closeBraceIdx);
                }
                // the type was a list of nominal values
                possVals = extractNominalVals(colType, fileLoc.toString(), tokenizer.getLineNumber());
                // KNIME uses string cells for nominal values.
                type = StringCell.TYPE;
                readUntilEOL(tokenizer, fileLoc.toString());
            } else {
                throw new InvalidSettingsException("Invalid column type" + " '" + colType + "' in attribute control " + "statement in ARFF file '" + fileLoc + "' at line " + tokenizer.getLineNumber() + ".");
            }
            DataColumnSpecCreator dcsc = new DataColumnSpecCreator(colName, type);
            if (possVals != null) {
                dcsc.setDomain(new DataColumnDomainCreator(possVals).createDomain());
            }
            colSpecs.add(dcsc.createSpec());
        } else if (token.equalsIgnoreCase("@RELATION")) {
            tableName = tokenizer.nextToken();
            if (tableName == null) {
                throw new InvalidSettingsException("Incomplete '@relation' statement at line " + tokenizer.getLineNumber() + " in ARFF file '" + fileLoc + "'.");
            }
            // we just ignore the name of the data set.
            readUntilEOL(tokenizer, null);
        } else if (token.charAt(0) == '@') {
            // Oops, what's that?
            LOGGER.warn("ARFF reader WARNING: Unsupported control " + "statement '" + token + "' in line " + tokenizer.getLineNumber() + ". Ignoring it! File: " + fileLoc);
            readUntilEOL(tokenizer, null);
        } else if (!token.equals("\n")) {
            LOGGER.warn("ARFF reader WARNING: Unsupported " + "statement '" + token + "' in header of ARFF file '" + fileLoc + "', line " + tokenizer.getLineNumber() + ". Ignoring it!");
            readUntilEOL(tokenizer, null);
        }
    // else ignore empty lines
    }
    // end of while (not EOF)
    // check uniqueness of column names
    HashSet<String> colNames = new HashSet<>();
    for (int c = 0; c < colSpecs.size(); c++) {
        if (!colNames.add(colSpecs.get(c).getName())) {
            throw new InvalidSettingsException("Two attributes with equal names defined in header of file '" + fileLoc + "'.");
        }
    }
    return new DataTableSpec(tableName, colSpecs.toArray(new DataColumnSpec[colSpecs.size()]));
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DataColumnSpec(org.knime.core.data.DataColumnSpec) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) BufferedReader(java.io.BufferedReader) DataCell(org.knime.core.data.DataCell) DataType(org.knime.core.data.DataType) Tokenizer(org.knime.core.util.tokenizer.Tokenizer) Vector(java.util.Vector) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)
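
A hypothetical call site for the method above. The file URL is invented, and passing null for the ExecutionMonitor works here because the method guards its only use with "exec != null"; the import for ARFFTable is omitted since its package is not shown in this snippet.

import java.net.URL;

import org.knime.core.data.DataTableSpec;

// import of ARFFTable omitted - add the package it lives in

public class ArffHeaderSketch {
    public static void main(final String[] args) throws Exception {
        URL arffUrl = new URL("file:/tmp/example.arff"); // hypothetical file
        DataTableSpec spec = ARFFTable.createDataTableSpecFromARFFfile(arffUrl, null);
        System.out.println("table '" + spec.getName() + "' with "
                + spec.getNumColumns() + " columns");
    }
}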

Example 8 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method testDelimiterSettingsSetColNum.

/*
     * With the "ignore empty tokens at end of row" option this got a bit more
     * complicated: we need to keep a range of acceptable column counts. The
     * lower bound is the number of non-empty columns read so far (the minimum
     * every row must have); the upper bound is the non-empty plus empty
     * columns seen so far. The reason: we may need some of the empty tokens
     * at the end of a row to fill the row, in case a later row has more
     * (non-empty) tokens.
     */
private static boolean testDelimiterSettingsSetColNum(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(settings);
    long fileSize = reader.getFileSize();
    int linesRead = 0;
    // column counter per line
    int columns = 0;
    // num of cols with these settings
    int numOfCols = -1;
    // num of cols incl. some empty tokens at EOR
    int maxNumOfCols = -1;
    // set it true to use these settings.
    boolean useSettings = false;
    // consecutive empty tokens read
    int consEmptyTokens = 0;
    boolean lastTokenWasDelimited = false;
    while (true) {
        if ((settings.getMaximumNumberOfRowsToRead() > -1) && (linesRead >= settings.getMaximumNumberOfRowsToRead())) {
            break;
        }
        String token = tokenizer.nextToken();
        if (fileSize > 0) {
            exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
        }
        if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
            columns++;
            lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
            // keep track of the empty tokens read.
            if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
                consEmptyTokens++;
            } else {
                consEmptyTokens = 0;
            }
        } else {
            if (columns > 0) {
                // ignore empty lines
                linesRead++;
                try {
                    if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
                        // cutItShort also checks for interrupts
                        settings.setAnalyzeUsedAllRows(false);
                        break;
                    }
                } catch (InterruptedExecutionException iee) {
                    tokenizer.closeSourceStream();
                    throw iee;
                }
                if (token == null && lastTokenWasDelimited) {
                    columns++;
                }
                if (linesRead > 1) {
                    if (numOfCols < 1) {
                        // this is the first row we are counting columns
                        // for
                        if (settings.ignoreEmptyTokensAtEndOfRow()) {
                            // these are the "hard" columns we need
                            numOfCols = columns - consEmptyTokens;
                            // we could fill up to this number with empty
                            // tokens
                            maxNumOfCols = columns;
                            if (numOfCols > 1) {
                                // if we get more than one col, the
                                // settings look reasonable
                                useSettings = true;
                            }
                        } else {
                            numOfCols = columns;
                            if (numOfCols <= 1) {
                                // we don't need this delimiter if we put
                                // everything in one column
                                useSettings = false;
                                break;
                            }
                            useSettings = true;
                        }
                    } else {
                        if (settings.ignoreEmptyTokensAtEndOfRow()) {
                            if ((columns - consEmptyTokens) > maxNumOfCols) {
                                // we read more non-empty columns than we
                                // could fill (in other rows) with empty
                                // tokens
                                useSettings = false;
                                break;
                            }
                            if (columns < numOfCols) {
                                // even with empty tokens this line has not
                                // enough columns
                                useSettings = false;
                                break;
                            }
                            if (columns < maxNumOfCols) {
                                // "maxNumOfCols" is the maximum number all
                                // rows can deliver.
                                maxNumOfCols = columns;
                            }
                            if ((columns - consEmptyTokens) > numOfCols) {
                                // Adjust the number of "hard" columns
                                numOfCols = columns - consEmptyTokens;
                                if (numOfCols > 1) {
                                    useSettings = true;
                                }
                            }
                            // the "hard" cols can never exceed the max
                            // cols
                            assert numOfCols <= maxNumOfCols;
                        } else {
                            // without the option, all rows must have the
                            // same number of cols
                            if (columns != numOfCols) {
                                // not good. Getting different number of
                                // columns in different lines.
                                useSettings = false;
                                break;
                            }
                        }
                    }
                }
            }
            consEmptyTokens = 0;
            columns = 0;
            lastTokenWasDelimited = false;
            if (token == null) {
                // seen end of file.
                break;
            }
        }
    }
    tokenizer.closeSourceStream();
    if (useSettings) {
        settings.setNumberOfColumns(numOfCols);
    }
    return useSettings;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
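
The range bookkeeping described in the comment block is easy to lose inside the loop above, so here is a simplified, self-contained restatement (illustration only, not KNIME code): each row contributes a "hard" count of non-empty columns and a total count including trailing empty tokens; the hard bound may only grow, the total bound may only shrink, and the settings are rejected once the two cross.

import java.util.Arrays;
import java.util.List;

public class ColumnRangeSketch {

    /** Returns {hard, max} for consistent rows, or null if a row falls outside the range. */
    static int[] columnRange(final List<String[]> rows) {
        int hard = -1;
        int max = -1;
        for (String[] row : rows) {
            int trailingEmpty = 0;
            for (int i = row.length - 1; i >= 0 && row[i].isEmpty(); i--) {
                trailingEmpty++;
            }
            int rowHard = row.length - trailingEmpty;
            if (hard < 0) {
                // the first row establishes the initial range
                hard = rowHard;
                max = row.length;
            } else if (rowHard > max || row.length < hard) {
                // this row can neither be padded nor trimmed into the range
                return null;
            } else {
                hard = Math.max(hard, rowHard);
                max = Math.min(max, row.length);
            }
        }
        return new int[] {hard, max};
    }

    public static void main(final String[] args) {
        // first row: 2 hard cols plus 2 trailing empties -> range [2, 4]
        // second row: 3 hard cols -> range tightens to [3, 3]
        System.out.println(Arrays.toString(columnRange(List.of(
                new String[] {"a", "b", "", ""},
                new String[] {"x", "y", "z"}))));  // prints [3, 3]
    }
}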

Example 9 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method getMaximumNumberOfColumns.

private static int getMaximumNumberOfColumns(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(settings);
    double fileSize = reader.getFileSize();
    // non-empty lines
    int dataLinesRead = 0;
    // the counter per line
    int colCount = 0;
    // the maximum
    int numOfCols = 0;
    // consecutive empty tokens
    int consEmptyTokens = 0;
    // remember it, in case the last token in the file has no delimiter
    boolean lastTokenWasDelimited = false;
    try {
        while (true) {
            String token = tokenizer.nextToken();
            if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                colCount++;
                lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
                // keep track of the empty tokens read.
                if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
                    consEmptyTokens++;
                } else {
                    consEmptyTokens = 0;
                }
            } else {
                // null token (=EOF) is a row delimiter
                if (colCount > 0) {
                    // ignore empty lines
                    dataLinesRead++;
                }
                if (token == null && colCount < numOfCols && lastTokenWasDelimited) {
                    // if the last line has no LF, EOF delimits the last column
                    colCount++;
                }
                if (settings.ignoreEmptyTokensAtEndOfRow()) {
                    // we are looking for the maximum - those empty tokens
                    // should not contribute to it.
                    colCount -= consEmptyTokens;
                }
                if (colCount > numOfCols) {
                    // we are supposed to return the maximum
                    numOfCols = colCount;
                    settings.setColumnNumDeterminingLineNumber(tokenizer.getLineNumber());
                }
                colCount = 0;
                consEmptyTokens = 0;
                if (token == null) {
                    break;
                }
                if (settings.getMaximumNumberOfRowsToRead() > -1) {
                    if (tokenizer.getLineNumber() > settings.getSkipFirstLines() + settings.getMaximumNumberOfRowsToRead()) {
                        break;
                    }
                }
                if (cutItShort(exec)) {
                    // cutItShort also checks for interrupts
                    if (dataLinesRead >= getShortCutLines(exec)) {
                        settings.setAnalyzeUsedAllRows(false);
                        break;
                    }
                    exec.setProgress(dataLinesRead / (double) getShortCutLines(exec));
                } else {
                    if (fileSize > 0) {
                        exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
                    }
                }
            }
        }
    } finally {
        tokenizer.closeSourceStream();
    }
    return numOfCols;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 10 with Tokenizer

use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.

the class FileAnalyzer method checkRowHeader.

/**
 * Looks at the first token of each line (except the first line) and returns true if they are all prefixed by the
 * same (possibly empty) string followed by a constantly incremented number.
 *
 * @param settings the file to look at with corresponding settings
 * @return true if it's reasonable to assume the file has row headers
 * @throws IOException if an I/O error occurs
 * @throws InterruptedExecutionException if analysis should be interrupted immediately
 */
private static boolean checkRowHeader(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
    BufferedFileReader reader = settings.createNewInputReader();
    final double fileSize = reader.getFileSize();
    long linesRead = 0;
    exec.setProgress("Guessing row IDs");
    Tokenizer tokenizer = new Tokenizer(reader);
    tokenizer.setSettings(settings);
    String token;
    HeaderHelper helper = null;
    boolean firstTokenInRow = true;
    try {
        while (true) {
            token = tokenizer.nextToken();
            if (token == null) {
                // end of file
                break;
            }
            if (firstTokenInRow && settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                // ignore empty rows
                continue;
            }
            if (firstTokenInRow) {
                firstTokenInRow = false;
                if (linesRead > 0) {
                    // we ignore the first line (could be col header line)
                    if (helper == null) {
                        // the first row ID we see
                        helper = HeaderHelper.extractPrefixAndIndexFromHeader(token);
                        if (helper == null) {
                            // that's not row header material
                            return false;
                        }
                    } else {
                        // all other headers must match the first one
                        if (!helper.testNextHeader(token)) {
                            return false;
                        }
                    }
                }
            } else {
                // swallow all tokens except new line delimiters
                if (settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
                    // the next token is the first one of a new row
                    firstTokenInRow = true;
                    linesRead++;
                    if (cutItShort(exec)) {
                        if (linesRead > getShortCutLines(exec)) {
                            break;
                        }
                        exec.setProgress(linesRead / (double) getShortCutLines(exec));
                    } else {
                        if (fileSize > 0) {
                            exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
                        }
                    }
                }
            }
        }
    } finally {
        tokenizer.closeSourceStream();
    }
    return true;
}
Also used : BufferedFileReader(org.knime.base.node.util.BufferedFileReader) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)
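
The HeaderHelper class used above is not shown on this page. Purely for illustration, here is a hypothetical, simplified stand-in that captures the idea from the javadoc: a common (possibly empty) prefix followed by an incrementing number. It assumes a fixed increment of 1; the real class may be more general.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// hypothetical, simplified stand-in for the real HeaderHelper
final class SimpleHeaderHelper {

    private static final Pattern PREFIX_AND_INDEX = Pattern.compile("(.*?)(\\d+)");

    private final String m_prefix;
    private long m_nextIndex;

    private SimpleHeaderHelper(final String prefix, final long firstIndex) {
        m_prefix = prefix;
        m_nextIndex = firstIndex + 1; // assumes a fixed increment of 1
    }

    /** Returns null if the token is not "prefix + number" material. */
    static SimpleHeaderHelper extractPrefixAndIndexFromHeader(final String token) {
        Matcher m = PREFIX_AND_INDEX.matcher(token);
        if (!m.matches()) {
            return null;
        }
        return new SimpleHeaderHelper(m.group(1), Long.parseLong(m.group(2)));
    }

    /** True if the token continues the sequence, e.g. "Row4" after "Row3". */
    boolean testNextHeader(final String token) {
        if (!token.equals(m_prefix + m_nextIndex)) {
            return false;
        }
        m_nextIndex++;
        return true;
    }
}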

Aggregations

Tokenizer (org.knime.core.util.tokenizer.Tokenizer): 13
DataCell (org.knime.core.data.DataCell): 7
StringReader (java.io.StringReader): 5
BufferedFileReader (org.knime.base.node.util.BufferedFileReader): 5
DataType (org.knime.core.data.DataType): 4
TokenizerSettings (org.knime.core.util.tokenizer.TokenizerSettings): 3
BufferedReader (java.io.BufferedReader): 2
ArrayList (java.util.ArrayList): 2
LinkedHashSet (java.util.LinkedHashSet): 2
DataColumnSpec (org.knime.core.data.DataColumnSpec): 2
DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator): 2
DataTableSpec (org.knime.core.data.DataTableSpec): 2
StringCell (org.knime.core.data.def.StringCell): 2
InvalidSettingsException (org.knime.core.node.InvalidSettingsException): 2
IOException (java.io.IOException): 1
InputStream (java.io.InputStream): 1
InputStreamReader (java.io.InputStreamReader): 1
HashSet (java.util.HashSet): 1
Vector (java.util.Vector): 1
DataColumnDomainCreator (org.knime.core.data.DataColumnDomainCreator): 1