Search in sources :

Example 11 with StringValue

use of org.knime.core.data.StringValue in project knime-core by knime.

the class CreateBitVectorNodeModel method scanMaxPos.

/**
 * Scans the selected string column and determines the largest integer that
 * occurs in any of its whitespace-separated tokens.
 *
 * @param data the table to scan
 * @param exec used to report progress and to check for cancellation
 * @return the largest parsed integer plus one
 * @throws CanceledExecutionException if the execution monitor reports a cancel
 */
private int scanMaxPos(final BufferedDataTable data, final ExecutionMonitor exec) throws CanceledExecutionException {
    final int columnIndex = data.getDataTableSpec().findColumnIndex(m_singleColumn.getStringValue());
    final long totalRows = data.size();
    int highest = Integer.MIN_VALUE;
    long rowNumber = 0;
    for (DataRow row : data) {
        rowNumber++;
        exec.setProgress((double) rowNumber / (double) totalRows, "processing row " + rowNumber + " of " + totalRows);
        exec.checkCanceled();
        final DataCell cell = row.getCell(columnIndex);
        if (cell.isMissing()) {
            // missing cells contribute no positions
            continue;
        }
        if (!(cell instanceof StringValue)) {
            throw new RuntimeException("Found incompatible type in row " + row.getKey().getString());
        }
        final String[] tokens = ((StringValue) cell).getStringValue().split("\\s");
        for (final String token : tokens) {
            try {
                highest = Math.max(highest, Integer.parseInt(token.trim()));
            } catch (NumberFormatException nfe) {
                // deliberately ignored: tokens that are not integers do not
                // influence the maximum
            }
        }
    }
    return highest + 1;
}
Also used : DataCell(org.knime.core.data.DataCell) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) StringValue(org.knime.core.data.StringValue) DataRow(org.knime.core.data.DataRow)

Example 12 with StringValue

use of org.knime.core.data.StringValue in project knime-core by knime.

the class CellSplitterCellFactory method createNewColumnTypes.

/**
 * Analyzes the values in the user selected column and tries to figure out
 * how many columns are needed to hold the split values and of which type
 * the new resulting columns have to be. <br>
 * If the "guess" flag in the settings object is NOT set and the output is
 * written to columns, it returns the column number entered by the user and
 * string type for all columns. <br>
 * If the "output as list" or "output as set" flag IS set in the settings
 * object it returns one as column number, since only one collection cell
 * is needed to store the output. <br>
 * Otherwise it runs once through the entire table, splits the value of the
 * selected column, stores the maximum number of parts received, and tries
 * to convert each part into an int (first), then into a double, and if both
 * fail it sets string type for the corresponding column.
 *
 * @param table the table with the column to examine (can be null, if no
 *            type guessing is required)
 * @param userSettings user settings
 * @param exec the execution context to set progress and check for cancel
 *            (can be null)
 * @return a settings object containing the same settings as the ones passed
 *         in and in addition the type (and number) of each column to add
 * @throws CanceledExecutionException if user cancels
 * @throws IllegalStateException if the user settings are invalid, the
 *             selected column is not in the table, or the analysis yields
 *             no column at all
 */
static CellSplitterSettings createNewColumnTypes(final BufferedDataTable table, final CellSplitterUserSettings userSettings, final ExecutionContext exec) throws CanceledExecutionException {
    // make sure we have settings we can deal with
    DataTableSpec spec = null;
    if (table != null) {
        spec = table.getDataTableSpec();
    }
    String msg = userSettings.getStatus(spec);
    if (msg != null) {
        // don't call this with invalid settings
        assert false;
        throw new IllegalStateException(msg);
    }
    // transfer the user settings into a new settings object (the result)
    CellSplitterSettings result;
    NodeSettings tmp = new NodeSettings("tmp");
    userSettings.saveSettingsTo(tmp);
    try {
        result = new CellSplitterSettings(tmp);
    } catch (InvalidSettingsException ise) {
        // the getStatus should have covered any invalidities; keep the
        // original exception as cause so the stack trace is not lost
        throw new IllegalStateException(ise.getMessage(), ise);
    }
    /*
     * not guessing types: output as columns
     */
    if (!userSettings.isGuessNumOfCols() && userSettings.isOutputAsCols()) {
        // we are not supposed to analyze the file.
        for (int col = 0; col < userSettings.getNumOfCols(); col++) {
            // create as many string columns as the user set
            result.addColumnOfType(StringCell.TYPE);
        }
        return result;
    }
    /*
     * not guessing types: output as list or set
     */
    if (userSettings.isOutputAsList() || userSettings.isOutputAsSet()) {
        DataType colType;
        if (userSettings.isOutputAsList()) {
            // list cell type
            colType = ListCell.getCollectionType(StringCell.TYPE);
        } else {
            // set cell type otherwise (there is no other option left)
            colType = SetCell.getCollectionType(StringCell.TYPE);
        }
        result.addColumnOfType(colType);
        return result;
    }
    /*
     * analyze table
     */
    int colIdx = table.getDataTableSpec().findColumnIndex(userSettings.getColumnName());
    if (colIdx < 0) {
        // the status should have checked this!
        assert false;
        throw new IllegalStateException("Input table doesn't contain selected column");
    }
    TokenizerSettings tokenizerSettings = createTokenizerSettings(userSettings);
    if (tokenizerSettings == null) {
        throw new IllegalStateException("Incorrect user settings");
    }
    long rowCnt = 0;
    long numOfRows = table.size();
    for (DataRow row : table) {
        rowCnt++;
        // check for cancel on EVERY row (also the ones skipped below because
        // of a missing cell), otherwise a column consisting mostly of missing
        // values could not be canceled and would report no progress
        if (exec != null) {
            exec.checkCanceled();
            exec.setProgress((double) rowCnt / (double) numOfRows, "Analyzing row #" + rowCnt + " of " + numOfRows);
        }
        DataCell inputCell = row.getCell(colIdx);
        if (inputCell.isMissing()) {
            // missing cells don't help determining the target types
            continue;
        }
        String inputString;
        if (inputCell instanceof StringValue) {
            inputString = ((StringValue) inputCell).getStringValue();
        } else {
            inputString = inputCell.toString();
        }
        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        // the reader is no good if it doesn't support the mark operation
        assert inputReader.markSupported();
        Tokenizer tokenizer = new Tokenizer(inputReader);
        tokenizer.setSettings(tokenizerSettings);
        int addedColIdx = -1;
        // read tokens from the input, analyze the tokens and set the type
        while (true) {
            String token = tokenizer.nextToken();
            addedColIdx++;
            if (token == null) {
                // done with that input string from that row
                break;
            }
            token = token.trim();
            DataType colType = IntCell.TYPE;
            // if we already got that many columns, verify the type
            if (addedColIdx < result.getNumOfColsGuessed()) {
                colType = result.getTypeOfColumn(addedColIdx);
            } else {
                // otherwise init the type with int
                result.addColumnOfType(colType);
            }
            // types are only ever widened: int -> double -> string
            if (colType.equals(IntCell.TYPE)) {
                // try converting it to an integer
                try {
                    Integer.parseInt(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really an integer. Try double.
                    colType = DoubleCell.TYPE;
                }
            }
            if (colType.equals(DoubleCell.TYPE)) {
                // try converting it to a double
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really a double. Use string.
                    colType = StringCell.TYPE;
                }
            }
            // write back the type
            result.replaceTypeOfColumn(addedColIdx, colType);
        }
    }
    /*
     * if the input table contained missing values only, we end up with no
     * column to add. Throw an exception.
     */
    if (result.getNumOfColsGuessed() < 1) {
        throw new IllegalStateException("Data analysis computed no " + "columns to add (happens if input table is empty or " + "has only missing values).\n" + "Please set the array size manually.");
    }
    return result;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataRow(org.knime.core.data.DataRow) NodeSettings(org.knime.core.node.NodeSettings) TokenizerSettings(org.knime.core.util.tokenizer.TokenizerSettings) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) StringReader(java.io.StringReader) DataType(org.knime.core.data.DataType) DataCell(org.knime.core.data.DataCell) StringValue(org.knime.core.data.StringValue) Tokenizer(org.knime.core.util.tokenizer.Tokenizer)

Example 13 with StringValue

use of org.knime.core.data.StringValue in project knime-core by knime.

the class ColumnAutoTypeCasterNodeModel method createDateAndTimeConverter.

/**
 * Creates a cell factory that parses the string cell at the given column
 * index with the configured date format and produces a date-and-time cell.
 * Missing cells and strings equal to the configured missing value pattern
 * yield a missing cell.
 *
 * @param colIdx index of the column to convert
 * @param colSpec spec of the column created by the factory
 * @return the converting cell factory
 */
private SingleCellFactory createDateAndTimeConverter(final int colIdx, final DataColumnSpec colSpec) {
    return new SingleCellFactory(colSpec) {

        private final Calendar m_cal = Calendar.getInstance(TimeZone.getDefault());

        private final SimpleDateFormat m_format = new SimpleDateFormat(m_dateFormat);

        // which fields the pattern carries, judged by its pattern letters
        private final boolean m_hasDate = m_dateFormat.contains("d");

        private final boolean m_hasTime = m_dateFormat.contains("H");

        private final boolean m_hasMillis = m_dateFormat.contains("S");

        {
            // parse and store everything relative to UTC
            final TimeZone utc = TimeZone.getTimeZone("UTC");
            m_format.setTimeZone(utc);
            m_cal.setTimeZone(utc);
        }

        @Override
        public DataCell getCell(final DataRow row) {
            final DataCell input = row.getCell(colIdx);
            if (input.isMissing()) {
                // missing in, missing out
                return DataType.getMissingCell();
            }
            final String text = ((StringValue) input).getStringValue();
            if (text.equals(m_missValPat)) {
                // the value matches the missing value pattern
                return DataType.getMissingCell();
            }
            try {
                m_cal.setTime(m_format.parse(text));
                return new DateAndTimeCell(m_cal.getTimeInMillis(), m_hasDate, m_hasTime, m_hasMillis);
            } catch (ParseException e) {
                throw new IllegalArgumentException("Can't convert '" + text + "' to " + DateAndTimeCell.TYPE.toString() + ". In " + row.getKey() + " Column" + colIdx + ". Disable quickscan and try again.", e);
            }
        }
    };
}
Also used : TimeZone(java.util.TimeZone) Calendar(java.util.Calendar) DataCell(org.knime.core.data.DataCell) DateAndTimeCell(org.knime.core.data.date.DateAndTimeCell) ParseException(java.text.ParseException) StringValue(org.knime.core.data.StringValue) SingleCellFactory(org.knime.core.data.container.SingleCellFactory) SimpleDateFormat(java.text.SimpleDateFormat) DataRow(org.knime.core.data.DataRow)

Example 14 with StringValue

use of org.knime.core.data.StringValue in project knime-core by knime.

the class ConditionalBoxPlotNodeModel method execute.

/**
 * {@inheritDoc}
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    // reset the results of any previous execution
    m_statistics = new LinkedHashMap<DataColumnSpec, double[]>();
    m_mildOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    m_extremeOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    // kept as double so that the progress division below is not an integer division
    double nrRows = inData[0].size();
    int rowCount = 0;
    int numericIndex = inData[0].getDataTableSpec().findColumnIndex(m_settings.numericColumn());
    int nominalIndex = inData[0].getDataTableSpec().findColumnIndex(m_settings.nominalColumn());
    // nominal value -> (numeric value -> keys of the rows holding that value)
    Map<String, Map<Double, Set<RowKey>>> data = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    // some default values .. if one column only has missing values.
    // NOTE(review): this iterates the domain values unconditionally - confirm
    // that the nominal column always carries possible values in its domain
    for (DataCell d : inData[0].getDataTableSpec().getColumnSpec(nominalIndex).getDomain().getValues()) {
        String name = ((StringValue) d).getStringValue();
        m_mildOutliers.put(name, new HashMap<Double, Set<RowKey>>());
        m_extremeOutliers.put(name, new HashMap<Double, Set<RowKey>>());
    }
    // group the row keys by nominal value and numeric value
    for (DataRow r : inData[0]) {
        exec.checkCanceled();
        exec.setProgress(rowCount++ / nrRows, "Separating...");
        if (!m_settings.showMissingValues()) {
            if (r.getCell(nominalIndex).isMissing()) {
                // missing cell in nominal values is unwanted?
                continue;
            }
        }
        String nominal = replaceSpaces(r.getCell(nominalIndex).toString());
        if (r.getCell(numericIndex).isMissing()) {
            // ignore missing cells in numeric column
            continue;
        }
        DoubleValue numeric = (DoubleValue) r.getCell(numericIndex);
        Map<Double, Set<RowKey>> map = data.get(nominal);
        if (map == null) {
            map = new LinkedHashMap<Double, Set<RowKey>>();
        }
        Set<RowKey> set = map.get(numeric.getDoubleValue());
        if (set == null) {
            set = new HashSet<RowKey>();
        }
        set.add(r.getKey());
        map.put(numeric.getDoubleValue(), set);
        data.put(nominal, map);
    }
    List<String> keys = new ArrayList<String>(data.keySet());
    boolean ignoreMissingValues = false;
    if (m_settings.showMissingValues() && !keys.contains(DataType.getMissingCell().toString())) {
        // we promised to create data for missing values..
        // if there aren't any.. we have to create them ourselves
        setWarningMessage("No missing values found.");
        ignoreMissingValues = true;
    }
    Collections.sort(keys);
    DataColumnSpec[] colSpecs = createColumnSpec(inData[0].getDataTableSpec().getColumnSpec(nominalIndex), ignoreMissingValues);
    if (keys.size() == 0) {
        setWarningMessage("All classes are empty.");
    }
    // compute the statistics per class and attach the min/max as domain
    int dataSetNr = 0;
    for (DataColumnSpec dcs : colSpecs) {
        String d = dcs.getName();
        if (data.get(d) == null || keys.size() == 0) {
            // no rows for this class - nothing to calculate
            dataSetNr++;
            continue;
        }
        exec.checkCanceled();
        exec.setProgress(dataSetNr / (double) keys.size(), "Creating statistics");
        Map<Double, Set<RowKey>> extremeOutliers = new LinkedHashMap<Double, Set<RowKey>>();
        Map<Double, Set<RowKey>> mildOutliers = new LinkedHashMap<Double, Set<RowKey>>();
        double[] stats = calculateStatistic(data.get(d), mildOutliers, extremeOutliers);
        double minimum = stats[BoxPlotNodeModel.MIN];
        double maximum = stats[BoxPlotNodeModel.MAX];
        DataColumnSpecCreator creator = new DataColumnSpecCreator(colSpecs[dataSetNr]);
        creator.setDomain(new DataColumnDomainCreator(new DoubleCell(minimum), new DoubleCell(maximum)).createDomain());
        colSpecs[dataSetNr] = creator.createSpec();
        m_statistics.put(colSpecs[dataSetNr], stats);
        m_mildOutliers.put(d, mildOutliers);
        m_extremeOutliers.put(d, extremeOutliers);
        dataSetNr++;
    }
    // build an empty table from the column specs and keep it as data array
    DataTableSpec dts = new DataTableSpec("MyTempTable", colSpecs);
    DataContainer cont = new DataContainer(dts);
    cont.close();
    m_dataArray = new DefaultDataArray(cont.getTable(), 1, 2);
    cont.dispose();
    if (ignoreMissingValues) {
        // insert a column spec for the missing value class at the position
        // determined by comparing its name with the existing column names
        DataColumnSpec[] temp = new DataColumnSpec[colSpecs.length + 1];
        DataColumnSpec missing = new DataColumnSpecCreator(DataType.getMissingCell().toString(), DataType.getMissingCell().getType()).createSpec();
        int i = 0;
        // the bound check is required: without it the loop runs past the end
        // of colSpecs if there are no columns at all or if the missing name
        // compares greater than every existing column name
        while (i < colSpecs.length && missing.getName().compareTo(colSpecs[i].getName()) > 0) {
            temp[i] = colSpecs[i];
            i++;
        }
        temp[i++] = missing;
        while (i < temp.length) {
            temp[i] = colSpecs[i - 1];
            i++;
        }
        colSpecs = temp;
    }
    /* Save inSpec of the numeric column to provide the view a way to
     * consider the input domain for normalization. */
    m_numColSpec = inData[0].getDataTableSpec().getColumnSpec(numericIndex);
    return new BufferedDataTable[] { createOutputTable(inData[0].getDataTableSpec(), colSpecs, exec).getTable() };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) HashSet(java.util.HashSet) Set(java.util.Set) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) RowKey(org.knime.core.data.RowKey) DoubleCell(org.knime.core.data.def.DoubleCell) DefaultDataArray(org.knime.base.node.util.DefaultDataArray) ArrayList(java.util.ArrayList) DataRow(org.knime.core.data.DataRow) LinkedHashMap(java.util.LinkedHashMap) DataContainer(org.knime.core.data.container.DataContainer) BufferedDataContainer(org.knime.core.node.BufferedDataContainer) DataColumnSpec(org.knime.core.data.DataColumnSpec) BufferedDataTable(org.knime.core.node.BufferedDataTable) StringValue(org.knime.core.data.StringValue) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 15 with StringValue

use of org.knime.core.data.StringValue in project knime-core by knime.

the class RegexSplitNodeModel method createColumnRearranger.

/**
 * Creates a rearranger that appends one string column per capturing group of
 * the configured regular expression. For every row whose value matches the
 * pattern completely, the new cells hold the text captured by the groups;
 * groups that did not participate in the match and rows that do not match
 * (or are missing) yield missing cells. The number of non-matching rows is
 * written to the internals after processing.
 *
 * @param spec the input table spec
 * @param internals receives the error count once all rows are processed
 * @return the rearranger appending the split columns
 * @throws InvalidSettingsException if there is no configuration, the
 *             selected column does not exist or is not string compatible
 */
@Override
protected ColumnRearranger createColumnRearranger(final DataTableSpec spec, final SimpleStreamableOperatorInternals internals) throws InvalidSettingsException {
    final AtomicInteger errorCounter = new AtomicInteger();
    if (m_settings == null) {
        throw new InvalidSettingsException("No configuration available.");
    }
    final int colIndex = spec.findColumnIndex(m_settings.getColumn());
    if (colIndex < 0) {
        throw new InvalidSettingsException("No such column in input table: " + m_settings.getColumn());
    }
    DataColumnSpec colSpec = spec.getColumnSpec(colIndex);
    if (!colSpec.getType().isCompatible(StringValue.class)) {
        throw new InvalidSettingsException("Selected column does not " + "contain strings");
    }
    final Pattern p = m_settings.compile();
    // Ask the regex engine for the number of capturing groups instead of
    // counting parentheses by hand: a manual scan miscounts named groups
    // "(?<name>...)" and parentheses inside character classes like "[(]".
    // The group count is a property of the pattern, so any input will do.
    final int newColCount = p.matcher("").groupCount();
    final DataColumnSpec[] newColSpecs = new DataColumnSpec[newColCount];
    for (int i = 0; i < newColCount; i++) {
        String name = DataTableSpec.getUniqueColumnName(spec, "split_" + i);
        newColSpecs[i] = new DataColumnSpecCreator(name, StringCell.TYPE).createSpec();
    }
    ColumnRearranger rearranger = new ColumnRearranger(spec);
    rearranger.append(new AbstractCellFactory(newColSpecs) {

        /**
         * {@inheritDoc}
         */
        @Override
        public DataCell[] getCells(final DataRow row) {
            // start with all-missing and fill in what the match provides
            DataCell[] result = new DataCell[newColCount];
            Arrays.fill(result, DataType.getMissingCell());
            DataCell c = row.getCell(colIndex);
            if (c.isMissing()) {
                return result;
            }
            String s = ((StringValue) c).getStringValue();
            Matcher m = p.matcher(s);
            if (!m.matches()) {
                // no match counts as error, the row keeps its missing cells
                errorCounter.incrementAndGet();
                return result;
            }
            // m.groupCount() equals newColCount as both stem from pattern p
            for (int i = 0; i < newColCount; i++) {
                // group(0) will return the entire string and is not
                // included in groupCount, see Matcher API for details
                String str = m.group(i + 1);
                if (str != null) {
                    // null for optional groups "(...)?"
                    result[i] = new StringCell(str);
                }
            }
            return result;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void afterProcessing() {
            // propagate error count
            internals.getConfig().addInt(CONFIG_KEY_ERRORCOUNT, errorCounter.get());
        }
    });
    return rearranger;
}
Also used : Pattern(java.util.regex.Pattern) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) Matcher(java.util.regex.Matcher) AbstractCellFactory(org.knime.core.data.container.AbstractCellFactory) DataRow(org.knime.core.data.DataRow) DataColumnSpec(org.knime.core.data.DataColumnSpec) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) StringCell(org.knime.core.data.def.StringCell) DataCell(org.knime.core.data.DataCell) StringValue(org.knime.core.data.StringValue)

Aggregations

StringValue (org.knime.core.data.StringValue)33 DataCell (org.knime.core.data.DataCell)25 DataRow (org.knime.core.data.DataRow)22 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)13 DataColumnSpec (org.knime.core.data.DataColumnSpec)10 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)10 DoubleValue (org.knime.core.data.DoubleValue)8 ColumnRearranger (org.knime.core.data.container.ColumnRearranger)8 ArrayList (java.util.ArrayList)7 DataTableSpec (org.knime.core.data.DataTableSpec)7 SingleCellFactory (org.knime.core.data.container.SingleCellFactory)7 StringCell (org.knime.core.data.def.StringCell)7 SettingsModelString (org.knime.core.node.defaultnodesettings.SettingsModelString)6 ParseException (java.text.ParseException)5 DataType (org.knime.core.data.DataType)5 BufferedDataTable (org.knime.core.node.BufferedDataTable)5 PortObject (org.knime.core.node.port.PortObject)5 IOException (java.io.IOException)4 CanceledExecutionException (org.knime.core.node.CanceledExecutionException)4 BitVectorType (org.knime.core.data.vector.bitvector.BitVectorType)3