use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.
the class ARFFTable method extractNominalVals.
/*
 * Expects the list of nominal values (comma separated, possibly quoted)
 * from the "@attribute" line as a string, with the enclosing curly braces
 * already stripped off by the caller. Returns an array of StringCells
 * containing the distinct values extracted from that list. The file name
 * and line number are passed in only for nicer error messages.
*/
private static DataCell[] extractNominalVals(final String valList, final String fileName, final int lineNo) throws InvalidSettingsException {
Collection<DataCell> vals = new LinkedHashSet<DataCell>();
// values may be quoted, so use a dedicated tokenizer for the list.
StringReader strReader = new StringReader(valList);
Tokenizer tokizer = new Tokenizer(strReader);
TokenizerSettings tokSets = new TokenizerSettings();
tokSets.addDelimiterPattern(",", false, false, false);
tokSets.addQuotePattern("'", "'");
tokSets.addQuotePattern("\"", "\"");
tokizer.setSettings(tokSets);
for (String val = tokizer.nextToken(); val != null; val = tokizer.nextToken()) {
String newval = val;
// trim off any whitespace.
if (!tokizer.lastTokenWasQuoted()) {
newval = val.trim();
}
// make sure we don't add the same value twice.
StringCell newValCell = new StringCell(newval);
if (!vals.contains(newValCell)) {
vals.add(newValCell);
} else {
LOGGER.warn("ARFF reader WARNING: The list of nominal " + "values in the header of file '" + fileName + "' line " + lineNo + " contains the value '" + newval + "' twice. Ignoring one appearance.");
}
}
return vals.toArray(new DataCell[vals.size()]);
}
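The quote-aware list splitting above is a reusable pattern: a fresh Tokenizer over a StringReader, one delimiter pattern and two quote patterns. Below is a minimal, self-contained sketch of that pattern; the class and method names are made up for illustration and are not part of the KNIME sources.
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;
public final class NominalListSplitter {
    /** Splits a comma separated, possibly quoted value list into its raw tokens. */
    public static List<String> split(final String valList) {
        Tokenizer tokenizer = new Tokenizer(new StringReader(valList));
        TokenizerSettings settings = new TokenizerSettings();
        // same configuration as extractNominalVals above
        settings.addDelimiterPattern(",", false, false, false);
        settings.addQuotePattern("'", "'");
        settings.addQuotePattern("\"", "\"");
        tokenizer.setSettings(settings);
        List<String> result = new ArrayList<>();
        for (String token = tokenizer.nextToken(); token != null; token = tokenizer.nextToken()) {
            // only unquoted tokens are trimmed, quoted values keep their whitespace
            result.add(tokenizer.lastTokenWasQuoted() ? token : token.trim());
        }
        tokenizer.closeSourceStream();
        return result;
    }
    public static void main(final String[] args) {
        // prints [red, blue, light green]
        System.out.println(split("'red',  blue  ,\"light green\""));
    }
}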
use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.
the class ARFFTable method createDataTableSpecFromARFFfile.
/**
* Reads in the header of the specified ARFF file and returns a
* corresponding table spec object.
*
* @param fileLoc the location of the ARFF file to read
* @param exec to enable users to cancel this process
* @return a table spec reflecting the settings in the file header
* @throws IOException if the file location couldn't be opened
* @throws InvalidSettingsException if the file contains an invalid format
* @throws CanceledExecutionException if user canceled
*/
public static DataTableSpec createDataTableSpecFromARFFfile(final URL fileLoc, final ExecutionMonitor exec) throws IOException, InvalidSettingsException, CanceledExecutionException {
// create a tokenizer to read the header
InputStream inStream = FileUtil.openStreamWithTimeout(fileLoc);
Tokenizer tokenizer = new Tokenizer(new BufferedReader(new InputStreamReader(inStream)));
// create tokenizer settings that will deliver us the attributes and
// arguments as tokens.
tokenizer.setSettings(getTokenizerHeaderSettings());
// prepare for creating a column spec for each "@attribute" read
Vector<DataColumnSpec> colSpecs = new Vector<DataColumnSpec>();
String tableName = null;
String token;
// read and process header statements until the data section begins.
while (true) {
if (exec != null) {
// throws exception if user canceled.
exec.checkCanceled();
}
DataCell[] possVals = null;
DataType type;
token = tokenizer.nextToken();
if (token == null) {
throw new InvalidSettingsException("Incorrect/Incomplete " + "ARFF file. No data section found.");
}
if (token.length() == 0) {
// ignore empty lines
continue;
}
if (token.equalsIgnoreCase("@DATA")) {
// this starts the data section: we are done.
break;
}
if (token.equalsIgnoreCase("@ATTRIBUTE")) {
// defines a new data column
String colName = tokenizer.nextToken();
String colType = null;
if (tokenizer.lastTokenWasQuoted() && tokenizer.getLastQuoteBeginPattern().equals("{")) {
// the nominal value list may be attached to the column
// name. Extract it from there and set it in 'colType'.
if (colName.charAt(0) == '{') {
// seems we only got a value list.
// The col name must be empty/missing then...
colType = colName;
colName = null;
} else {
int openBraceIdx = colName.indexOf('{');
int closeBraceIdx = colName.lastIndexOf('}');
colType = colName.substring(openBraceIdx + 1, closeBraceIdx);
colName = colName.substring(0, openBraceIdx);
// we ignore everything after the nominal value list
}
} else {
colType = tokenizer.nextToken();
}
if ((colName == null) || (colType == null)) {
throw new InvalidSettingsException("Incomplete '@attribute' statement at line " + tokenizer.getLineNumber() + " in ARFF file '" + fileLoc + "'.");
}
// map the declared column type onto a KNIME data type.
if (colType.equalsIgnoreCase("NUMERIC") || colType.equalsIgnoreCase("REAL")) {
type = DoubleCell.TYPE;
// ignore whatever still comes in that line, warn though
readUntilEOL(tokenizer, fileLoc.toString());
} else if (colType.equalsIgnoreCase("INTEGER")) {
type = IntCell.TYPE;
// ignore whatever still comes in that line, warn though
readUntilEOL(tokenizer, fileLoc.toString());
} else if (colType.equalsIgnoreCase("STRING")) {
type = StringCell.TYPE;
// ignore whatever still comes in that line, warn though
readUntilEOL(tokenizer, fileLoc.toString());
} else if (colType.equalsIgnoreCase("DATE")) {
// we use string cell for date ...
type = StringCell.TYPE;
// ignore whatever date format is specified
readUntilEOL(tokenizer, null);
} else if (tokenizer.lastTokenWasQuoted() && tokenizer.getLastQuoteBeginPattern().equals("{")) {
// the braces should still be in the string
int openBraceIdx = colType.indexOf('{');
int closeBraceIdx = colType.lastIndexOf('}');
if ((openBraceIdx >= 0) && (closeBraceIdx > 0) && (openBraceIdx < closeBraceIdx)) {
colType = colType.substring(openBraceIdx + 1, closeBraceIdx);
}
// the type was a list of nominal values
possVals = extractNominalVals(colType, fileLoc.toString(), tokenizer.getLineNumber());
// KNIME uses string cells for nominal values.
type = StringCell.TYPE;
readUntilEOL(tokenizer, fileLoc.toString());
} else {
throw new InvalidSettingsException("Invalid column type" + " '" + colType + "' in attribute control " + "statement in ARFF file '" + fileLoc + "' at line " + tokenizer.getLineNumber() + ".");
}
DataColumnSpecCreator dcsc = new DataColumnSpecCreator(colName, type);
if (possVals != null) {
dcsc.setDomain(new DataColumnDomainCreator(possVals).createDomain());
}
colSpecs.add(dcsc.createSpec());
} else if (token.equalsIgnoreCase("@RELATION")) {
tableName = tokenizer.nextToken();
if (tableName == null) {
throw new InvalidSettingsException("Incomplete '@relation' statement at line " + tokenizer.getLineNumber() + " in ARFF file '" + fileLoc + "'.");
}
// skip whatever else is on the '@relation' line.
readUntilEOL(tokenizer, null);
} else if (token.charAt(0) == '@') {
// an unrecognized control statement: warn and skip it.
LOGGER.warn("ARFF reader WARNING: Unsupported control " + "statement '" + token + "' in line " + tokenizer.getLineNumber() + ". Ignoring it! File: " + fileLoc);
readUntilEOL(tokenizer, null);
} else if (!token.equals("\n")) {
LOGGER.warn("ARFF reader WARNING: Unsupported " + "statement '" + token + "' in header of ARFF file '" + fileLoc + "', line " + tokenizer.getLineNumber() + ". Ignoring it!");
readUntilEOL(tokenizer, null);
}
// else ignore empty lines
}
// end of while (not EOF)
// check uniqueness of column names
HashSet<String> colNames = new HashSet<>();
for (int c = 0; c < colSpecs.size(); c++) {
if (!colNames.add(colSpecs.get(c).getName())) {
throw new InvalidSettingsException("Two attributes with equal names defined in header of file '" + fileLoc + "'.");
}
}
return new DataTableSpec(tableName, colSpecs.toArray(new DataColumnSpec[colSpecs.size()]));
}
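As a rough usage sketch, a caller of this method could look like the following. The file location and the plain ExecutionMonitor are assumptions for illustration only, and the import for ARFFTable is omitted because its package does not appear in this snippet.
import java.net.URL;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataTableSpec;
import org.knime.core.node.ExecutionMonitor;
public final class ArffHeaderDemo {
    public static void main(final String[] args) throws Exception {
        // hypothetical ARFF file location - replace with a real one
        URL fileLoc = new URL("file:/tmp/iris.arff");
        // a plain ExecutionMonitor allows the call to be canceled but
        // does not report progress anywhere
        DataTableSpec spec =
            ARFFTable.createDataTableSpecFromARFFfile(fileLoc, new ExecutionMonitor());
        // print one line per "@attribute" found in the header
        for (DataColumnSpec col : spec) {
            System.out.println(col.getName() + " -> " + col.getType());
        }
    }
}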
use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.
the class FileAnalyzer method testDelimiterSettingsSetColNum.
/*
* With the new "ignore empty tokens at end of row" option this got a bit
* more complicated: We need to keep a range of numberOfColumns that we can
* accept. The lower bound will be the number of non-empty columns we read
* so far (because this is the minimum all rows must have), the maximum will
 * be the non-empty plus empty columns we have seen so far, because we may
 * need some of those empty tokens at the end of a row to fill the row in
 * case a later row has more (non-empty) tokens.
*/
private static boolean testDelimiterSettingsSetColNum(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
long fileSize = reader.getFileSize();
int linesRead = 0;
// column counter per line
int columns = 0;
// num of cols with these settings
int numOfCols = -1;
// num of cols incl. some empty tokens at EOR
int maxNumOfCols = -1;
// set to true if these settings should be used.
boolean useSettings = false;
// consecutive empty tokens read
int consEmptyTokens = 0;
boolean lastTokenWasDelimited = false;
while (true) {
if ((settings.getMaximumNumberOfRowsToRead() > -1) && (linesRead >= settings.getMaximumNumberOfRowsToRead())) {
break;
}
String token = tokenizer.nextToken();
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
}
if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
columns++;
lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
// keep track of the empty tokens read.
if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
consEmptyTokens++;
} else {
consEmptyTokens = 0;
}
} else {
if (columns > 0) {
// ignore empty lines
linesRead++;
try {
if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
// cutItShort also checks for interrupts
settings.setAnalyzeUsedAllRows(false);
break;
}
} catch (InterruptedExecutionException iee) {
tokenizer.closeSourceStream();
throw iee;
}
if (token == null && lastTokenWasDelimited) {
columns++;
}
if (linesRead > 1) {
if (numOfCols < 1) {
// this is the first data line we determine the column count for
if (settings.ignoreEmptyTokensAtEndOfRow()) {
// these are the "hard" columns we need
numOfCols = columns - consEmptyTokens;
// we could fill up to this number with empty
// tokens
maxNumOfCols = columns;
if (numOfCols > 1) {
// if we get more than one column, these
// settings look reasonable
useSettings = true;
}
} else {
numOfCols = columns;
if (numOfCols <= 1) {
// we don't need this delimiter if we put
// everything in one column
useSettings = false;
break;
}
useSettings = true;
}
} else {
if (settings.ignoreEmptyTokensAtEndOfRow()) {
if ((columns - consEmptyTokens) > maxNumOfCols) {
// we read more non-empty columns than we could
// fill (in other rows) with empty tokens
useSettings = false;
break;
}
if (columns < numOfCols) {
// even with empty tokens this line has not
// enough columns
useSettings = false;
break;
}
if (columns < maxNumOfCols) {
// "maxNumOfCols" is the maximum number all
// rows can deliver.
maxNumOfCols = columns;
}
if ((columns - consEmptyTokens) > numOfCols) {
// Adjust the number of "hard" columns
numOfCols = columns - consEmptyTokens;
if (numOfCols > 1) {
useSettings = true;
}
}
// the hard column count can never exceed the fillable maximum
assert numOfCols <= maxNumOfCols;
} else {
// without that option, all rows must deliver the same number of cols
if (columns != numOfCols) {
// not good. Getting different number of
// columns in different lines.
useSettings = false;
break;
}
}
}
}
}
consEmptyTokens = 0;
columns = 0;
lastTokenWasDelimited = false;
if (token == null) {
// seen end of file.
break;
}
}
}
tokenizer.closeSourceStream();
if (useSettings) {
settings.setNumberOfColumns(numOfCols);
}
return useSettings;
}
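The "hard" versus "fillable" column logic is easier to see in isolation. The following is a stripped-down sketch of the same idea, run over an in-memory string with a fixed semicolon delimiter instead of a FileReaderNodeSettings source; all names outside the Tokenizer API are made up, and the meaning of the addDelimiterPattern flags is assumed from the calls shown above.
import java.io.StringReader;
import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;
public final class DelimiterProbe {
    /** Returns the "hard" column count if a semicolon delimiter fits all rows, or -1 otherwise. */
    public static int probeSemicolon(final String data) {
        Tokenizer tokenizer = new Tokenizer(new StringReader(data));
        TokenizerSettings settings = new TokenizerSettings();
        settings.addDelimiterPattern(";", false, false, false);
        // the line feed is also a delimiter, returned as a token of its own
        // (the meaning of the boolean flags is assumed here)
        settings.addDelimiterPattern("\n", false, true, false);
        tokenizer.setSettings(settings);
        int numOfCols = -1;    // non-empty columns every row must provide
        int maxNumOfCols = -1; // non-empty plus trailing empty columns seen
        int columns = 0;
        int consEmptyTokens = 0;
        try {
            for (String token = tokenizer.nextToken();; token = tokenizer.nextToken()) {
                boolean endOfRow = (token == null) || token.equals("\n");
                if (!endOfRow) {
                    columns++;
                    consEmptyTokens = token.isEmpty() ? consEmptyTokens + 1 : 0;
                    continue;
                }
                if (columns > 0) { // ignore empty lines
                    int hard = columns - consEmptyTokens;
                    if (numOfCols < 1) {
                        // first data row: initialize the acceptable range
                        numOfCols = hard;
                        maxNumOfCols = columns;
                    } else if (hard > maxNumOfCols || columns < numOfCols) {
                        return -1; // rows cannot be reconciled with this delimiter
                    } else {
                        maxNumOfCols = Math.min(maxNumOfCols, columns);
                        numOfCols = Math.max(numOfCols, hard);
                    }
                }
                columns = 0;
                consEmptyTokens = 0;
                if (token == null) {
                    break;
                }
            }
        } finally {
            tokenizer.closeSourceStream();
        }
        return numOfCols > 1 ? numOfCols : -1;
    }
    public static void main(final String[] args) {
        System.out.println(probeSemicolon("a;b;;\nc;d;e\n")); // prints 3
    }
}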
use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.
the class FileAnalyzer method getMaximumNumberOfColumns.
private static int getMaximumNumberOfColumns(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
double fileSize = reader.getFileSize();
// non-empty lines
int dataLinesRead = 0;
// the counter per line
int colCount = 0;
// the maximum
int numOfCols = 0;
// consecutive empty tokens
int consEmptyTokens = 0;
// remember it, in case the last token in the file has no delimiter
boolean lastTokenWasDelimited = false;
try {
while (true) {
String token = tokenizer.nextToken();
if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
colCount++;
lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
// keep track of the empty tokens read.
if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
consEmptyTokens++;
} else {
consEmptyTokens = 0;
}
} else {
// null token (=EOF) is a row delimiter
if (colCount > 0) {
// ignore empty lines
dataLinesRead++;
}
if (token == null && colCount < numOfCols && lastTokenWasDelimited) {
// if the last line has no LF, EOF delimits the last column
colCount++;
}
if (settings.ignoreEmptyTokensAtEndOfRow()) {
// we are looking for the maximum - those empty tokens
// should not contribute to it.
colCount -= consEmptyTokens;
}
if (colCount > numOfCols) {
// we are supposed to return the maximum
numOfCols = colCount;
settings.setColumnNumDeterminingLineNumber(tokenizer.getLineNumber());
}
colCount = 0;
consEmptyTokens = 0;
if (token == null) {
break;
}
if (settings.getMaximumNumberOfRowsToRead() > -1) {
if (tokenizer.getLineNumber() > settings.getSkipFirstLines() + settings.getMaximumNumberOfRowsToRead()) {
break;
}
}
if (cutItShort(exec)) {
// cutItShort also checks for interrupts
if (dataLinesRead >= getShortCutLines(exec)) {
settings.setAnalyzeUsedAllRows(false);
break;
}
exec.setProgress(dataLinesRead / (double) getShortCutLines(exec));
} else {
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
}
}
}
}
} finally {
tokenizer.closeSourceStream();
}
return numOfCols;
}
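The lastTokenWasDelimited flag covers a subtle edge case: a file whose last line ends in a delimiter but has no terminating line break. Below is a small illustrative sketch of that behaviour, under the same assumptions about the Tokenizer API as in the previous sketch.
import java.io.StringReader;
import org.knime.core.util.tokenizer.Tokenizer;
import org.knime.core.util.tokenizer.TokenizerSettings;
public final class LastLineColumnCount {
    /** Counts the columns of the last row, even if the data does not end with a line break. */
    public static int columnsInLastRow(final String data) {
        Tokenizer tokenizer = new Tokenizer(new StringReader(data));
        TokenizerSettings settings = new TokenizerSettings();
        settings.addDelimiterPattern(",", false, false, false);
        // line feeds end a row and are returned as their own token
        // (the meaning of the boolean flags is assumed here)
        settings.addDelimiterPattern("\n", false, true, false);
        tokenizer.setSettings(settings);
        int colCount = 0;
        boolean lastTokenWasDelimited = false;
        try {
            for (String token = tokenizer.nextToken(); token != null; token = tokenizer.nextToken()) {
                if (token.equals("\n")) {
                    colCount = 0; // a new row starts
                    lastTokenWasDelimited = false;
                } else {
                    colCount++;
                    lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
                }
            }
        } finally {
            tokenizer.closeSourceStream();
        }
        // if the data ends right after a delimiter, EOF closes one more, empty, column
        if (lastTokenWasDelimited) {
            colCount++;
        }
        return colCount;
    }
    public static void main(final String[] args) {
        System.out.println(columnsInLastRow("x,y\na,b,")); // prints 3
    }
}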
use of org.knime.core.util.tokenizer.Tokenizer in project knime-core by knime.
the class FileAnalyzer method checkRowHeader.
/**
* Looks at the first token of each line (except the first line) and returns true if they are all prefixed by the
* same (possibly empty) string followed by a constantly incremented number.
*
* @param settings the file to look at with corresponding settings
* @return true if it's reasonable to assume the file has row headers
* @throws IOException if an I/O error occurs
* @throws InterruptedExecutionException if analysis should be interrupted immediately
*/
private static boolean checkRowHeader(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
final double fileSize = reader.getFileSize();
long linesRead = 0;
exec.setProgress("Guessing row IDs");
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
String token;
HeaderHelper helper = null;
boolean firstTokenInRow = true;
try {
while (true) {
token = tokenizer.nextToken();
if (token == null) {
// end of file
break;
}
if (firstTokenInRow && settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
// ignore empty rows
continue;
}
if (firstTokenInRow) {
firstTokenInRow = false;
if (linesRead > 0) {
// we ignore the first line (could be col header line)
if (helper == null) {
// the first row ID we see
helper = HeaderHelper.extractPrefixAndIndexFromHeader(token);
if (helper == null) {
// that's not row header material
return false;
}
} else {
// all other headers must match the first one
if (!helper.testNextHeader(token)) {
return false;
}
}
}
} else {
// swallow all tokens except new line delimiters
if (settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
// the next token is the first one of a new row
firstTokenInRow = true;
linesRead++;
if (cutItShort(exec)) {
if (linesRead > getShortCutLines(exec)) {
break;
}
exec.setProgress(linesRead / (double) getShortCutLines(exec));
} else {
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
}
}
}
}
}
} finally {
tokenizer.closeSourceStream();
}
return true;
}
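The HeaderHelper class used above is not shown in this snippet. The following stand-in sketches the idea of the prefix-plus-index check; it assumes the index grows by exactly one per row, which may differ from the real KNIME implementation, and all names here are illustrative only.
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class RowHeaderPattern {
    private static final Pattern PREFIX_AND_INDEX = Pattern.compile("(.*?)(\\d+)");
    private final String m_prefix;
    private long m_lastIndex;
    private RowHeaderPattern(final String prefix, final long firstIndex) {
        m_prefix = prefix;
        m_lastIndex = firstIndex;
    }
    /** Returns null if the token does not end in a number (not row header material). */
    public static RowHeaderPattern extractPrefixAndIndexFromHeader(final String token) {
        Matcher m = PREFIX_AND_INDEX.matcher(token);
        if (!m.matches()) {
            return null;
        }
        return new RowHeaderPattern(m.group(1), Long.parseLong(m.group(2)));
    }
    /** True if the token has the same prefix and the next index in the sequence. */
    public boolean testNextHeader(final String token) {
        Matcher m = PREFIX_AND_INDEX.matcher(token);
        if (!m.matches() || !m.group(1).equals(m_prefix)) {
            return false;
        }
        long index = Long.parseLong(m.group(2));
        if (index != m_lastIndex + 1) {
            return false;
        }
        m_lastIndex = index;
        return true;
    }
    public static void main(final String[] args) {
        RowHeaderPattern h = extractPrefixAndIndexFromHeader("Row1");
        System.out.println(h.testNextHeader("Row2")); // true
        System.out.println(h.testNextHeader("Row5")); // false, index jumped
    }
}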