use of org.knime.core.data.StringValue in project knime-core by knime.
the class CreateBitVectorNodeModel method scanMaxPos.
/**
 * Scans the column selected in {@code m_singleColumn} and determines the
 * largest integer that occurs in any of its whitespace-separated string
 * cells.
 *
 * Missing cells are skipped; tokens that are not parsable as integers are
 * ignored. If no integer is found at all, the result is
 * {@code Integer.MIN_VALUE + 1}.
 *
 * @param data the table to scan
 * @param exec monitor used to report progress and to check for cancellation
 * @return the largest integer found, plus one
 * @throws CanceledExecutionException if the monitor reports cancellation
 * @throws RuntimeException if a non-missing cell is not a {@link StringValue}
 */
private int scanMaxPos(final BufferedDataTable data, final ExecutionMonitor exec) throws CanceledExecutionException {
    final int columnIndex = data.getDataTableSpec().findColumnIndex(m_singleColumn.getStringValue());
    final long totalRows = data.size();
    long processed = 0;
    int highest = Integer.MIN_VALUE;
    for (DataRow row : data) {
        processed++;
        exec.setProgress((double) processed / (double) totalRows, "processing row " + processed + " of " + totalRows);
        exec.checkCanceled();
        final DataCell cell = row.getCell(columnIndex);
        if (cell.isMissing()) {
            continue;
        }
        if (!(cell instanceof StringValue)) {
            throw new RuntimeException("Found incompatible type in row " + row.getKey().getString());
        }
        final String content = ((StringValue) cell).getStringValue();
        for (String token : content.split("\\s")) {
            try {
                highest = Math.max(highest, Integer.parseInt(token.trim()));
            } catch (NumberFormatException nfe) {
                // deliberately ignored here:
                // the same exception will be logged from the cell factory
            }
        }
    }
    return highest + 1;
}
use of org.knime.core.data.StringValue in project knime-core by knime.
the class CellSplitterCellFactory method createNewColumnTypes.
/**
 * Analyzes the values in the user selected column and tries to figure out
 * how many columns are needed to hold the split values and of which type
 * the new resulting columns have to be. <br>
 * If the "guess" flag in the settings object is NOT set and the output is
 * requested as columns, it returns the column number entered by the user
 * and string type for all columns.
 * If the "output as list" or "output as set" flag IS set in the settings
 * object it returns one as column number, since only one collection cell
 * is needed to store the output.
 * Otherwise it runs once through the entire table, splits the value of the
 * selected column, stores the maximum number of parts received, and tries
 * to convert each part into an int (first), then into a double, and if both
 * fail it sets string type for the corresponding column.
 *
 * @param table the table with the column to examine (can be null, if no
 *            type guessing is required; it is dereferenced as soon as the
 *            table has to be analyzed)
 * @param userSettings user settings
 * @param exec the execution context to set progress and check for cancel
 *            (can be null)
 * @return a settings object containing the same settings as the ones passed
 *         in and in addition the type (and number) of each column to add
 * @throws CanceledExecutionException if user cancels
 * @throws IllegalStateException if the user settings are invalid, the
 *             selected column is not part of the table, no tokenizer
 *             settings can be created, or the analysis yields no column
 */
static CellSplitterSettings createNewColumnTypes(final BufferedDataTable table, final CellSplitterUserSettings userSettings, final ExecutionContext exec) throws CanceledExecutionException {
    // make sure we have settings we can deal with
    DataTableSpec spec = null;
    if (table != null) {
        spec = table.getDataTableSpec();
    }
    String msg = userSettings.getStatus(spec);
    if (msg != null) {
        // don't call this with invalid settings
        assert false;
        throw new IllegalStateException(msg);
    }
    // transfer the user settings into a new settings object (the result)
    CellSplitterSettings result;
    NodeSettings tmp = new NodeSettings("tmp");
    userSettings.saveSettingsTo(tmp);
    try {
        result = new CellSplitterSettings(tmp);
    } catch (InvalidSettingsException ise) {
        // the getStatus should have covered any invalidities; keep the
        // original exception as cause so its stack trace is not lost
        throw new IllegalStateException(ise.getMessage(), ise);
    }
    /*
     * not guessing types: output as columns
     */
    if (!userSettings.isGuessNumOfCols() && userSettings.isOutputAsCols()) {
        // we are not supposed to analyze the file.
        for (int col = 0; col < userSettings.getNumOfCols(); col++) {
            // create as many string columns as the user set
            result.addColumnOfType(StringCell.TYPE);
        }
        return result;
    }
    /*
     * not guessing types: output as list or set
     */
    if (userSettings.isOutputAsList() || userSettings.isOutputAsSet()) {
        DataType colType = null;
        // list cell type
        if (userSettings.isOutputAsList()) {
            colType = ListCell.getCollectionType(StringCell.TYPE);
            // set cell type otherwise (there is no other option left)
        } else {
            colType = SetCell.getCollectionType(StringCell.TYPE);
        }
        result.addColumnOfType(colType);
        return result;
    }
    /*
     * analyze table
     */
    int colIdx = table.getDataTableSpec().findColumnIndex(userSettings.getColumnName());
    if (colIdx < 0) {
        // the status should have checked this!
        assert false;
        throw new IllegalStateException("Input table doesn't contain selected column");
    }
    TokenizerSettings tokenizerSettings = createTokenizerSettings(userSettings);
    if (tokenizerSettings == null) {
        throw new IllegalStateException("Incorrect user settings");
    }
    long rowCnt = 0;
    long numOfRows = table.size();
    for (DataRow row : table) {
        rowCnt++;
        String inputString = "";
        DataCell inputCell = row.getCell(colIdx);
        if (inputCell.isMissing()) {
            // missing cells don't help determining the target types
            continue;
        }
        if (inputCell instanceof StringValue) {
            inputString = ((StringValue) inputCell).getStringValue();
        } else {
            inputString = inputCell.toString();
        }
        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        // the reader is no good if it doesn't support the mark operation
        assert inputReader.markSupported();
        Tokenizer tokenizer = new Tokenizer(inputReader);
        tokenizer.setSettings(tokenizerSettings);
        int addedColIdx = -1;
        // read tokens from the input, analyze the tokens and set the type
        while (true) {
            String token = tokenizer.nextToken();
            addedColIdx++;
            if (token == null) {
                // done with that input string from that row
                break;
            }
            token = token.trim();
            DataType colType = IntCell.TYPE;
            // if we already got that many columns, verify the type
            if (addedColIdx < result.getNumOfColsGuessed()) {
                colType = result.getTypeOfColumn(addedColIdx);
            } else {
                // otherwise init the type with int
                result.addColumnOfType(colType);
            }
            // the type of a column is only ever widened: int -> double -> string
            if (colType.equals(IntCell.TYPE)) {
                // try converting it to an integer
                try {
                    Integer.parseInt(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really an integer. Try double.
                    colType = DoubleCell.TYPE;
                }
            }
            if (colType.equals(DoubleCell.TYPE)) {
                // try converting it to a double
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really a double. Use string.
                    colType = StringCell.TYPE;
                }
            }
            // write back the type
            result.replaceTypeOfColumn(addedColIdx, colType);
        }
        if (exec != null) {
            exec.checkCanceled();
            exec.setProgress((double) rowCnt / (double) numOfRows, "Analyzing row #" + rowCnt + " of " + numOfRows);
        }
    }
    /*
     * if the input table contained missing values only, we end up with no
     * column to add. Throw an exception.
     */
    if (result.getNumOfColsGuessed() < 1) {
        throw new IllegalStateException("Data analysis computed no " + "columns to add (happens if input table is empty or " + "has only missing values).\n" + "Please set the array size manually.");
    }
    return result;
}
use of org.knime.core.data.StringValue in project knime-core by knime.
the class ColumnAutoTypeCasterNodeModel method createDateAndTimeConverter.
/**
 * Creates a cell factory that parses the string cell at {@code colIdx}
 * with the pattern stored in {@code m_dateFormat} (interpreted in UTC) and
 * returns it as a {@link DateAndTimeCell}.
 *
 * Missing input cells and strings equal to {@code m_missValPat} yield a
 * missing cell; a string that cannot be parsed raises an
 * {@link IllegalArgumentException}.
 *
 * @param colIdx index of the column to convert
 * @param colSpec spec of the column produced by the factory
 * @return the converting cell factory
 */
private SingleCellFactory createDateAndTimeConverter(final int colIdx, final DataColumnSpec colSpec) {
    return new SingleCellFactory(colSpec) {
        private final Calendar m_cal = Calendar.getInstance(TimeZone.getDefault());
        private final SimpleDateFormat m_format = new SimpleDateFormat(m_dateFormat);
        // which parts the resulting cell carries is derived from the
        // pattern letters contained in the format string
        private final boolean m_hasDate = m_dateFormat.contains("d");
        private final boolean m_hasTime = m_dateFormat.contains("H");
        private final boolean m_hasMillis = m_dateFormat.contains("S");
        {
            // parse and store everything relative to UTC
            final TimeZone utc = TimeZone.getTimeZone("UTC");
            m_format.setTimeZone(utc);
            m_cal.setTimeZone(utc);
        }

        @Override
        public DataCell getCell(final DataRow row) {
            final DataCell input = row.getCell(colIdx);
            if (input.isMissing()) {
                // missing stays missing
                return DataType.getMissingCell();
            }
            final String text = ((StringValue) input).getStringValue();
            if (text.equals(m_missValPat)) {
                // the configured missing value pattern maps to a missing cell
                return DataType.getMissingCell();
            }
            try {
                m_cal.setTime(m_format.parse(text));
                return new DateAndTimeCell(m_cal.getTimeInMillis(), m_hasDate, m_hasTime, m_hasMillis);
            } catch (ParseException e) {
                throw new IllegalArgumentException("Can't convert '" + text + "' to " + DateAndTimeCell.TYPE.toString() + ". In " + row.getKey() + " Column" + colIdx + ". Disable quickscan and try again.", e);
            }
        }
    };
}
use of org.knime.core.data.StringValue in project knime-core by knime.
the class ConditionalBoxPlotNodeModel method execute.
/**
 * {@inheritDoc}
 *
 * Groups the values of the configured numeric column by the (string
 * representation of the) configured nominal column, computes box plot
 * statistics and outliers per group, stores them in the model fields used
 * by the view and returns the table created by {@code createOutputTable}.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    m_statistics = new LinkedHashMap<DataColumnSpec, double[]>();
    m_mildOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    m_extremeOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    double nrRows = inData[0].size();
    int rowCount = 0;
    int numericIndex = inData[0].getDataTableSpec().findColumnIndex(m_settings.numericColumn());
    int nominalIndex = inData[0].getDataTableSpec().findColumnIndex(m_settings.nominalColumn());
    Map<String, Map<Double, Set<RowKey>>> data = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
    // some default values .. if one column only has missing values.
    for (DataCell d : inData[0].getDataTableSpec().getColumnSpec(nominalIndex).getDomain().getValues()) {
        String name = ((StringValue) d).getStringValue();
        m_mildOutliers.put(name, new HashMap<Double, Set<RowKey>>());
        m_extremeOutliers.put(name, new HashMap<Double, Set<RowKey>>());
    }
    // collect, per nominal value, the row keys for every numeric value
    for (DataRow r : inData[0]) {
        exec.checkCanceled();
        exec.setProgress(rowCount++ / nrRows, "Separating...");
        if (!m_settings.showMissingValues()) {
            if (r.getCell(nominalIndex).isMissing()) {
                // missing cell in nominal values is unwanted?
                continue;
            }
        }
        String nominal = replaceSpaces(r.getCell(nominalIndex).toString());
        if (r.getCell(numericIndex).isMissing()) {
            // ignore missing cells in numeric column
            continue;
        }
        DoubleValue numeric = (DoubleValue) r.getCell(numericIndex);
        Map<Double, Set<RowKey>> map = data.get(nominal);
        if (map == null) {
            map = new LinkedHashMap<Double, Set<RowKey>>();
        }
        Set<RowKey> set = map.get(numeric.getDoubleValue());
        if (set == null) {
            set = new HashSet<RowKey>();
        }
        set.add(r.getKey());
        map.put(numeric.getDoubleValue(), set);
        data.put(nominal, map);
    }
    List<String> keys = new ArrayList<String>(data.keySet());
    boolean ignoreMissingValues = false;
    if (m_settings.showMissingValues() && !keys.contains(DataType.getMissingCell().toString())) {
        // we promised to create data for missing values..
        // if there aren't any.. we have to create them ourselves
        setWarningMessage("No missing values found.");
        ignoreMissingValues = true;
    }
    Collections.sort(keys);
    DataColumnSpec[] colSpecs = createColumnSpec(inData[0].getDataTableSpec().getColumnSpec(nominalIndex), ignoreMissingValues);
    if (keys.size() == 0) {
        setWarningMessage("All classes are empty.");
    }
    int dataSetNr = 0;
    for (DataColumnSpec dcs : colSpecs) {
        String d = dcs.getName();
        if (data.get(d) == null || keys.size() == 0) {
            // no rows for this class: keep the spec as it is
            dataSetNr++;
            continue;
        }
        exec.checkCanceled();
        exec.setProgress(dataSetNr / (double) keys.size(), "Creating statistics");
        Map<Double, Set<RowKey>> extremeOutliers = new LinkedHashMap<Double, Set<RowKey>>();
        Map<Double, Set<RowKey>> mildOutliers = new LinkedHashMap<Double, Set<RowKey>>();
        double[] stats = calculateStatistic(data.get(d), mildOutliers, extremeOutliers);
        double minimum = stats[BoxPlotNodeModel.MIN];
        double maximum = stats[BoxPlotNodeModel.MAX];
        // attach the observed min/max as domain of the class column
        DataColumnSpecCreator creator = new DataColumnSpecCreator(colSpecs[dataSetNr]);
        creator.setDomain(new DataColumnDomainCreator(new DoubleCell(minimum), new DoubleCell(maximum)).createDomain());
        colSpecs[dataSetNr] = creator.createSpec();
        m_statistics.put(colSpecs[dataSetNr], stats);
        m_mildOutliers.put(d, mildOutliers);
        m_extremeOutliers.put(d, extremeOutliers);
        dataSetNr++;
    }
    DataTableSpec dts = new DataTableSpec("MyTempTable", colSpecs);
    DataContainer cont = new DataContainer(dts);
    cont.close();
    m_dataArray = new DefaultDataArray(cont.getTable(), 1, 2);
    cont.dispose();
    if (ignoreMissingValues) {
        // insert a column spec for the missing value class at its position
        // in name order
        DataColumnSpec[] temp = new DataColumnSpec[colSpecs.length + 1];
        DataColumnSpec missing = new DataColumnSpecCreator(DataType.getMissingCell().toString(), DataType.getMissingCell().getType()).createSpec();
        int i = 0;
        // the bounds check is required: the missing column may sort after
        // every existing column (or there may be no columns at all), in
        // which case it has to be appended at the end
        while (i < colSpecs.length && missing.getName().compareTo(colSpecs[i].getName()) > 0) {
            temp[i] = colSpecs[i];
            i++;
        }
        temp[i++] = missing;
        while (i < temp.length) {
            temp[i] = colSpecs[i - 1];
            i++;
        }
        colSpecs = temp;
    }
    /* Save inSpec of the numeric column to provide the view a way to
     * consider the input domain for normalization. */
    m_numColSpec = inData[0].getDataTableSpec().getColumnSpec(numericIndex);
    return new BufferedDataTable[] { createOutputTable(inData[0].getDataTableSpec(), colSpecs, exec).getTable() };
}
use of org.knime.core.data.StringValue in project knime-core by knime.
the class RegexSplitNodeModel method createColumnRearranger.
/**
 * Creates a rearranger that appends one string column per capturing group
 * of the configured regular expression.
 *
 * For every row the selected string column is matched against the pattern;
 * on a full match the captured groups are written to the new columns
 * (groups that did not participate stay missing). Rows that are missing
 * yield only missing cells; rows that do not match yield only missing
 * cells and increase the error count, which is written to
 * {@code internals} after processing.
 *
 * @param spec the input table spec
 * @param internals receives the error count under
 *            {@code CONFIG_KEY_ERRORCOUNT} once processing has finished
 * @return the rearranger appending the split columns
 * @throws InvalidSettingsException if no settings are available, the
 *             selected column does not exist or is not string compatible
 */
@Override
protected ColumnRearranger createColumnRearranger(final DataTableSpec spec, final SimpleStreamableOperatorInternals internals) throws InvalidSettingsException {
    AtomicInteger errorCounter = new AtomicInteger();
    if (m_settings == null) {
        throw new InvalidSettingsException("Not configuration available.");
    }
    final int colIndex = spec.findColumnIndex(m_settings.getColumn());
    if (colIndex < 0) {
        throw new InvalidSettingsException("No such column in input table: " + m_settings.getColumn());
    }
    DataColumnSpec colSpec = spec.getColumnSpec(colIndex);
    if (!colSpec.getType().isCompatible(StringValue.class)) {
        throw new InvalidSettingsException("Selected column does not " + "contain strings");
    }
    final Pattern p = m_settings.compile();
    // Ask the regex engine for the number of capturing groups instead of
    // scanning the pattern text for parentheses: a textual scan miscounts
    // named groups "(?<name>...)" (which are capturing) as well as "("
    // inside character classes or quoted sections.
    final int newColCount = p.matcher("").groupCount();
    final DataColumnSpec[] newColSpecs = new DataColumnSpec[newColCount];
    for (int i = 0; i < newColCount; i++) {
        String name = DataTableSpec.getUniqueColumnName(spec, "split_" + i);
        newColSpecs[i] = new DataColumnSpecCreator(name, StringCell.TYPE).createSpec();
    }
    ColumnRearranger rearranger = new ColumnRearranger(spec);
    rearranger.append(new AbstractCellFactory(newColSpecs) {
        /**
         * {@inheritDoc}
         */
        @Override
        public DataCell[] getCells(final DataRow row) {
            DataCell[] result = new DataCell[newColCount];
            Arrays.fill(result, DataType.getMissingCell());
            DataCell c = row.getCell(colIndex);
            if (c.isMissing()) {
                return result;
            }
            String s = ((StringValue) c).getStringValue();
            Matcher m = p.matcher(s);
            if (m.matches()) {
                int max = m.groupCount();
                // defensive only: both counts stem from the same pattern,
                // so this is not expected to trigger
                if (m.groupCount() > newColCount) {
                    errorCounter.incrementAndGet();
                    max = newColCount;
                }
                for (int i = 0; i < max; i++) {
                    // group(0) will return the entire string and is not
                    // included in groupCount, see Matcher API for details
                    String str = m.group(i + 1);
                    if (str != null) {
                        // null for optional groups "(...)?"
                        result[i] = new StringCell(str);
                    }
                }
                return result;
            } else {
                errorCounter.incrementAndGet();
                return result;
            }
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void afterProcessing() {
            // propagate error count
            internals.getConfig().addInt(CONFIG_KEY_ERRORCOUNT, errorCounter.get());
        }
    });
    return rearranger;
}
Aggregations