Use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by KNIME.
From class CellSplitterCellFactory, method createNewColumnTypes:
/**
 * Analyzes the values in the user selected column and tries to figure out
 * how many columns are needed to hold the split values and of which type
 * the new resulting columns have to be. <br>
 * If the "output as list" or "output as set" flag IS set in the settings
 * object it returns one as column number, since only one collection cell
 * is needed to store the output.
 * If the "guess" flag in the settings object is NOT set, it returns the
 * column number entered by the user and string type for all columns.
 * Otherwise it runs once through the entire table, splits the value of the
 * selected column, stores the maximum number of parts received, and tries
 * to convert each part into an int (first), then into a double, and if both
 * fail it sets string type for the corresponding column.
 *
 * @param table the table with the column to examine (can be null, if no
 *            type guessing is required)
 * @param userSettings user settings
 * @param exec the execution context to set progress and check for cancel
 *            (can be null)
 * @return a settings object containing the same settings as the ones passed
 *         in and in addition the type (and number) of each column to add
 * @throws CanceledExecutionException if user cancels
 */
static CellSplitterSettings createNewColumnTypes(final BufferedDataTable table,
        final CellSplitterUserSettings userSettings, final ExecutionContext exec)
        throws CanceledExecutionException {
    // make sure we have settings we can deal with
    DataTableSpec spec = null;
    if (table != null) {
        spec = table.getDataTableSpec();
    }
    String msg = userSettings.getStatus(spec);
    if (msg != null) {
        // don't call this with invalid settings
        assert false;
        throw new IllegalStateException(msg);
    }

    // transfer the user settings into a new settings object (the result)
    CellSplitterSettings result;
    NodeSettings tmp = new NodeSettings("tmp");
    userSettings.saveSettingsTo(tmp);
    try {
        result = new CellSplitterSettings(tmp);
    } catch (InvalidSettingsException ise) {
        // the getStatus should have covered any invalidities;
        // keep the original exception as cause for diagnosis
        throw new IllegalStateException(ise.getMessage(), ise);
    }

    /*
     * not guessing types: output as columns
     */
    if (!userSettings.isGuessNumOfCols() && userSettings.isOutputAsCols()) {
        // we are not supposed to analyze the table:
        // create as many string columns as the user set
        for (int col = 0; col < userSettings.getNumOfCols(); col++) {
            result.addColumnOfType(StringCell.TYPE);
        }
        return result;
    }

    /*
     * not guessing types: output as list or set
     */
    if (userSettings.isOutputAsList() || userSettings.isOutputAsSet()) {
        final DataType colType;
        if (userSettings.isOutputAsList()) {
            // list cell type
            colType = ListCell.getCollectionType(StringCell.TYPE);
        } else {
            // set cell type otherwise (there is no other option left)
            colType = SetCell.getCollectionType(StringCell.TYPE);
        }
        result.addColumnOfType(colType);
        return result;
    }

    /*
     * analyze table
     */
    int colIdx = table.getDataTableSpec().findColumnIndex(userSettings.getColumnName());
    if (colIdx < 0) {
        // the status should have checked this!
        assert false;
        throw new IllegalStateException("Input table doesn't contain selected column");
    }

    TokenizerSettings tokenizerSettings = createTokenizerSettings(userSettings);
    if (tokenizerSettings == null) {
        throw new IllegalStateException("Incorrect user settings");
    }

    long rowCnt = 0;
    long numOfRows = table.size();
    for (DataRow row : table) {
        rowCnt++;
        DataCell inputCell = row.getCell(colIdx);
        if (inputCell.isMissing()) {
            // missing cells don't help determining the target types
            continue;
        }
        final String inputString;
        if (inputCell instanceof StringValue) {
            inputString = ((StringValue)inputCell).getStringValue();
        } else {
            inputString = inputCell.toString();
        }

        // init the tokenizer
        StringReader inputReader = new StringReader(inputString);
        // the reader is no good if it doesn't support the mark operation
        assert inputReader.markSupported();
        Tokenizer tokenizer = new Tokenizer(inputReader);
        tokenizer.setSettings(tokenizerSettings);

        int addedColIdx = -1;
        // read tokens from the input, analyze the tokens and set the type:
        // the guessed type can only be demoted (int -> double -> string)
        while (true) {
            String token = tokenizer.nextToken();
            addedColIdx++;
            if (token == null) {
                // done with that input string from that row
                break;
            }
            token = token.trim();
            DataType colType = IntCell.TYPE;
            if (addedColIdx < result.getNumOfColsGuessed()) {
                // if we already got that many columns, verify the type
                colType = result.getTypeOfColumn(addedColIdx);
            } else {
                // otherwise init the type with int
                result.addColumnOfType(colType);
            }
            if (colType.equals(IntCell.TYPE)) {
                // try converting it to an integer
                try {
                    Integer.parseInt(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really an integer. Try double.
                    colType = DoubleCell.TYPE;
                }
            }
            if (colType.equals(DoubleCell.TYPE)) {
                // try converting it to a double
                try {
                    Double.parseDouble(token);
                } catch (NumberFormatException nfe) {
                    // that wasn't really a double. Use string.
                    colType = StringCell.TYPE;
                }
            }
            // write back the (possibly demoted) type
            result.replaceTypeOfColumn(addedColIdx, colType);
        }
        if (exec != null) {
            exec.checkCanceled();
            exec.setProgress((double)rowCnt / (double)numOfRows,
                "Analyzing row #" + rowCnt + " of " + numOfRows);
        }
    }

    /*
     * if the input table contained missing values only, we end up with no
     * column to add. Throw an exception.
     */
    if (result.getNumOfColsGuessed() < 1) {
        throw new IllegalStateException("Data analysis computed no "
            + "columns to add (happens if input table is empty or "
            + "has only missing values).\n"
            + "Please set the array size manually.");
    }
    return result;
}
Use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by KNIME.
From class CellSplitterCellFactory, method createTokenizerSettings:
/*
 * Builds the tokenizer configuration for splitting a cell value, or
 * returns null if the user settings are absent or define no delimiter.
 */
private static TokenizerSettings createTokenizerSettings(final CellSplitterUserSettings userSettings) {
    if (userSettings == null) {
        return null;
    }
    final String rawDelimiter = userSettings.getDelimiter();
    if ((rawDelimiter == null) || rawDelimiter.isEmpty()) {
        // without a delimiter there is nothing to tokenize on
        return null;
    }

    TokenizerSettings settings = new TokenizerSettings();

    String delimiter = rawDelimiter;
    if (userSettings.isUseEscapeCharacter()) {
        // interpret Java-style escape sequences (e.g. "\t") in the delimiter
        delimiter = StringEscapeUtils.unescapeJava(delimiter);
    }
    settings.addDelimiterPattern(delimiter,
        /* combineConsecutive */ false,
        /* returnAsSeparateToken */ false,
        /* includeInToken */ false);

    String quote = userSettings.getQuotePattern();
    if ((quote != null) && !quote.isEmpty()) {
        // same pattern opens and closes a quoted section; backslash escapes
        settings.addQuotePattern(quote, quote, '\\', userSettings.isRemoveQuotes());
    }
    return settings;
}
Use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by KNIME.
From class ARFFTable, method getTokenizerHeaderSettings:
// createDataTableSpecFromARFFfile(URL)
/*
 * Returns the tokenizer configuration used to read the ARFF file header.
 */
private static TokenizerSettings getTokenizerHeaderSettings() {
    final TokenizerSettings headerSettings = new TokenizerSettings();
    // ARFF single line comments start with '%'
    headerSettings.addSingleLineCommentPattern("%", false, false);
    // LF is a row separator - register it as a delimiter that is
    // returned as its own token
    headerSettings.addDelimiterPattern("\n",
        /* combine multiple */ true,
        /* return as token */ true,
        /* include in token */ false);
    // ARFF knows single and double quotes
    headerSettings.addQuotePattern("'", "'");
    headerSettings.addQuotePattern("\"", "\"");
    // the nominal values list is quoted into one token (but the
    // braces must stay in)
    headerSettings.addQuotePattern("{", "}", true);
    // the attribute statement and arguments are separated by space(s) ...
    headerSettings.addDelimiterPattern(" ", true, false, false);
    // ... or tabs ...
    headerSettings.addDelimiterPattern("\t", true, false, false);
    // ... or any combination of them
    headerSettings.setCombineMultipleDelimiters(true);
    return headerSettings;
}
Use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by KNIME.
From class ARFFTable, method extractNominalVals:
/*
 * Expects the list of nominal values (in curly braces and comma separated)
 * from the "@attribute" line to be next in the tokenizer (including the
 * beginning of the list with the opening brace). Will return an array of
 * StringCells with the different values extracted (and removed) from the
 * tokenizer. It will leave the EOL at the end of the list in the tokenizer.
 * Pass in also file name for nice error messages.
 */
private static DataCell[] extractNominalVals(final String valList, final String fileName, final int lineNo) throws InvalidSettingsException {
    Collection<DataCell> vals = new LinkedHashSet<DataCell>();
    // quotes and escapes must be honored - delegate parsing to a tokenizer
    StringReader valueReader = new StringReader(valList);
    Tokenizer valueTokenizer = new Tokenizer(valueReader);
    TokenizerSettings valueTokSettings = new TokenizerSettings();
    valueTokSettings.addDelimiterPattern(",", false, false, false);
    valueTokSettings.addQuotePattern("'", "'");
    valueTokSettings.addQuotePattern("\"", "\"");
    valueTokenizer.setSettings(valueTokSettings);

    String token;
    while ((token = valueTokenizer.nextToken()) != null) {
        // trim whitespace off unquoted values only
        String newval = valueTokenizer.lastTokenWasQuoted() ? token : token.trim();
        // LinkedHashSet.add returns false for duplicates - don't add the
        // same value twice, just warn about it
        if (!vals.add(new StringCell(newval))) {
            LOGGER.warn("ARFF reader WARNING: The list of nominal " + "values in the header of file '" + fileName + "' line " + lineNo + " contains the value '" + newval + "' twice. Ignoring one appearance.");
        }
    }
    return vals.toArray(new DataCell[vals.size()]);
}
Use of org.knime.core.util.tokenizer.TokenizerSettings in project knime-core by KNIME.
From class BatchExecutor, method splitWorkflowVariableArg:
/**
 * Splits the argument to -workflow.variable into its sub-components (name, value, type) and returns it as array.
 *
 * @param arg The string to split
 * @return The components of the string, no validation is done.
 * @since 2.11
 */
public static String[] splitWorkflowVariableArg(final String arg) {
    // configure a tokenizer that splits on unquoted commas; both quote
    // styles support backslash escapes inside
    TokenizerSettings splitSettings = new TokenizerSettings();
    splitSettings.addQuotePattern("\"", "\"", '\\');
    splitSettings.addQuotePattern("'", "'", '\\');
    splitSettings.addDelimiterPattern(",",
        /* combine multiple */ false,
        /* return as token */ false,
        /* include in token */ false);

    Tokenizer splitter = new Tokenizer(new StringReader(arg));
    splitter.setSettings(splitSettings);

    List<String> parts = new ArrayList<String>();
    for (String token = splitter.nextToken(); token != null; token = splitter.nextToken()) {
        parts.add(token);
    }
    return parts.toArray(new String[parts.size()]);
}
Aggregations