Use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.
The class FileAnalyzer, method addQuotes.
/**
* Adds quotes to the settings object. It counts the occurrences of double and single quotes in each line. If a
* line contains an odd number of a quote character, that character is not considered a quote (unless the line
* also contains an odd number of escaped quotes of that type).
*
* @param settings the object to add quote settings to. Must contain file location and possibly comments - but no
* delimiters yet!
* @param exec to check for cancellations and to report progress
* @throws IOException if an I/O error occurs
* @throws InterruptedExecutionException if analysis was interrupted
*/
private static void addQuotes(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
assert settings != null;
assert settings.getAllQuotes().size() == 0;
assert settings.getDataFileLocation() != null;
assert settings.getAllDelimiters().size() == 0;
BufferedFileReader reader = settings.createNewInputReader();
Tokenizer tokenizer = new Tokenizer(reader);
double fileSize = reader.getFileSize();
exec.setProgress("Guessing quotes");
// add '\n' as the only delimiter, so we get one line per token
settings.addDelimiterPattern("\n", true, false, false);
settings.addDelimiterPattern("\r", true, false, false);
tokenizer.setSettings(settings);
// reconstruct original settings.
settings.removeAllDelimiters();
int linesRead = 0;
// by default we support " and ' as quotes both with escape character \
boolean useDoubleQuotes = true;
boolean escapeDoubleQuotes = true;
boolean useSingleQuotes = true;
boolean escapeSingleQuotes = true;
String token;
try {
while (true) {
token = tokenizer.nextToken();
if (token == null) {
// seen end of file.
break;
}
if (token.length() == 0) {
// ignore empty lines
continue;
}
linesRead++;
// cutItShort also checks for interrupt
if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
settings.setAnalyzeUsedAllRows(false);
break;
}
if (cutItShort(exec)) {
exec.setProgress(linesRead / (double) getShortCutLines(exec));
} else if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
}
// Count the number of quote characters. If an odd number
// appears don't support this quote character.
// double quote count
int dq = 0;
// escaped double quotes
int edq = 0;
// single quote count
int sq = 0;
// escaped single quote count
int esq = 0;
boolean esc = false;
for (int c = 0; c < token.length(); c++) {
char ch = token.charAt(c);
if (ch == '\\') {
if (esc) {
// it's a double backslash, leave esc mode
esc = false;
} else {
esc = true;
}
} else {
if (ch == '"') {
if (!esc) {
dq++;
} else {
// previous char was escape char.
edq++;
}
}
if (ch == '\'') {
if (!esc) {
sq++;
} else {
esq++;
}
}
esc = false;
}
}
// now figure out what to do...
if (dq % 2 != 0) {
// odd number of quotes
if (edq % 2 != 0) {
// fixable: drop the escape character, so the odd number of escaped quotes evens out the count
escapeDoubleQuotes = false;
} else {
// nothing we can do but stop treating double quotes as quotes
useDoubleQuotes = false;
if (!useSingleQuotes) {
// final decision made
break;
}
}
}
if (sq % 2 != 0) {
// odd number of quotes
if (esq % 2 != 0) {
// fixable: drop the escape character, so the odd number of escaped quotes evens out the count
escapeSingleQuotes = false;
} else {
// nothing we can do but stop treating single quotes as quotes
useSingleQuotes = false;
if (!useDoubleQuotes) {
// final decision made
break;
}
}
}
}
if (useDoubleQuotes) {
if (escapeDoubleQuotes) {
settings.addQuotePattern("\"", "\"", '\\');
} else {
settings.addQuotePattern("\"", "\"");
}
}
if (useSingleQuotes) {
if (escapeSingleQuotes) {
settings.addQuotePattern("'", "'", '\\');
} else {
settings.addQuotePattern("'", "'");
}
}
} finally {
// do this even if analysis is interrupted
tokenizer.closeSourceStream();
}
}
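The decision per quote character is a parity check: if a line contains an odd number of unescaped quotes, the character only survives as a quote when the line also contains an odd number of escaped quotes (escaping is then dropped so the escaped quotes make the total even); otherwise the character is not used as a quote at all. A minimal, self-contained sketch of the per-line counting (plain Java, not the KNIME API; QuoteCounter and countQuotes are hypothetical names):

class QuoteCounter {

    /**
     * Counts occurrences of the given quote character in one line.
     *
     * @return a two-element array: [0] = unescaped occurrences, [1] = occurrences preceded by a backslash
     */
    static int[] countQuotes(final String line, final char quote) {
        int plain = 0;
        int escaped = 0;
        boolean esc = false;
        for (int i = 0; i < line.length(); i++) {
            char ch = line.charAt(i);
            if (ch == '\\') {
                // a double backslash leaves escape mode again
                esc = !esc;
            } else {
                if (ch == quote) {
                    if (esc) {
                        escaped++;
                    } else {
                        plain++;
                    }
                }
                esc = false;
            }
        }
        return new int[]{plain, escaped};
    }
}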
Use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.
The class FileAnalyzer, method createColumnTypes.
private static ColProperty[] createColumnTypes(final FileReaderNodeSettings userSettings, final FileReaderNodeSettings result, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = result.createNewInputReader();
long fileSize = reader.getFileSize();
exec.setProgress("Guessing column types");
// extract user preset type - if we got any
DataType[] userTypes = new DataType[result.getNumberOfColumns()];
Vector<ColProperty> userColProps = userSettings.getColumnProperties();
if (userColProps != null) {
for (int t = 0; t < userTypes.length; t++) {
if (t >= userColProps.size()) {
break;
}
ColProperty cProp = userColProps.get(t);
if (cProp != null) {
DataColumnSpec cSpec = cProp.getColumnSpec();
if (cSpec != null) {
userTypes[t] = cSpec.getType();
}
}
}
}
DataType[] types = new DataType[result.getNumberOfColumns()];
// if we find a number that can't be parsed,
// we set it as missing value pattern
String[] missValPattern = new String[result.getNumberOfColumns()];
// we can use this missing value pattern only if we also got a real
// value for that same column
boolean[] gotValue = new boolean[result.getNumberOfColumns()];
for (int t = 0; t < types.length; t++) {
// set user type - if set.
if (userTypes[t] != null) {
types[t] = userTypes[t];
} else {
types[t] = IntCell.TYPE;
}
// initialize the data structures:
missValPattern[t] = null;
gotValue[t] = false;
}
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(result);
int linesRead = 0;
int colIdx = -1;
// we create simple cells only, no execContext needed
DataCellFactory cellFactory = new DataCellFactory(null);
cellFactory.setDecimalSeparator(result.getDecimalSeparator());
cellFactory.setThousandsSeparator(result.getThousandsSeparator());
try {
// close the stream on an exception
while (true) {
String token = tokenizer.nextToken();
if (token == null) {
// reached EOF
break;
}
colIdx++;
if (result.getFileHasRowHeaders() && (colIdx == 0) && (!result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted()))) {
// ignore the row header - get the next token/column
token = tokenizer.nextToken();
if (token == null) {
// EOF
break;
}
}
checkInterrupt(exec);
if (result.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
// it's a row delimiter. We could check whether we got enough tokens for the row from
// the file. But if not - what would we do...
if (colIdx > 0) {
// only count not empty lines
linesRead++;
exec.setProgress("Verifying column types");
}
colIdx = -1;
if (cutItShort(exec)) {
if (linesRead >= getShortCutLines(exec)) {
result.setAnalyzeUsedAllRows(false);
break;
}
exec.setProgress(linesRead / (double) getShortCutLines(exec));
} else {
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
}
}
continue;
}
if ((linesRead < 1) && (!userSettings.isFileHasColumnHeadersUserSet() || userSettings.getFileHasColumnHeaders())) {
// skip the first line - it could be a column header line, unless we know it's not
continue;
}
if (colIdx >= result.getNumberOfColumns()) {
// Ignore the extra columns.
continue;
}
if (userTypes[colIdx] != null) {
// user preset type - nothing to do for us in this column
continue;
}
cellFactory.setMissingValuePattern(missValPattern[colIdx]);
// for numbers we trim tokens and allow empty for missValue
token = token.trim();
if (types[colIdx].isCompatible(IntValue.class)) {
DataCell dc = cellFactory.createDataCellOfType(IntCell.TYPE, token);
if (dc != null) {
gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
continue;
}
// not an integer - could it be the missing value?
if (missValPattern[colIdx] == null) {
// we accept one token that can't be
// parsed per column - but we don't use doubles
// as missing value! Would be odd.
dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
if (dc == null) {
missValPattern[colIdx] = token;
continue;
}
}
// not an integer, not the missing value
// - could be a double
types[colIdx] = DoubleCell.TYPE;
}
if (types[colIdx].isCompatible(DoubleValue.class)) {
DataCell dc = cellFactory.createDataCellOfType(DoubleCell.TYPE, token);
if (dc != null) {
gotValue[colIdx] = gotValue[colIdx] || !dc.isMissing();
continue;
}
// not a double - missing value maybe?
if (missValPattern[colIdx] == null) {
// we accept one token that can't be parsed
// per column as missing value pattern
missValPattern[colIdx] = token;
continue;
}
// not a double, not a missing value,
// lets accept everything: StringCell
types[colIdx] = StringCell.TYPE;
gotValue[colIdx] = true;
}
}
} finally {
tokenizer.closeSourceStream();
}
// set all columns we didn't see any real value for to String.
// Discard any (possible) missing value pattern (that works,
// because we don't accept doubles as missing value patterns).
// Warn the user.
String cols = "";
int cnt = 0;
for (int t = 0; t < types.length; t++) {
if (userTypes[t] == null && !gotValue[t]) {
// do it only for types not set by the user
assert types[t].equals(IntCell.TYPE);
types[t] = StringCell.TYPE;
boolean gotOneVal = missValPattern[t] != null;
missValPattern[t] = null;
if ((cnt < 21) && !gotOneVal && ((userColProps == null) || (userColProps.size() <= t) || (userColProps.get(t) == null) || (!userColProps.get(t).getSkipThisColumn()))) {
if (cnt < 20) {
cols += "#" + t + ", ";
cnt++;
} else if (cnt == 20) {
cols += "...and more..., ";
cnt++;
}
}
}
}
if (cols.length() > 0) {
LOGGER.warn("Didn't get any value for column(s) with index " + // cut off the comma
cols.substring(0, cols.length() - 2) + ". Please verify column type(s).");
}
// pack column types and column missing values in one object
ColProperty[] colPropResult = new ColProperty[types.length];
for (int c = 0; c < colPropResult.length; c++) {
ColProperty cp = new ColProperty();
DataColumnSpecCreator dcsc = new DataColumnSpecCreator("Foo", types[c]);
cp.setColumnSpec(dcsc.createSpec());
if (types[c].equals(StringCell.TYPE)) {
// use the global one, if set, otherwise '?'
if (result.getMissValuePatternStrCols() != null) {
cp.setMissingValuePattern(result.getMissValuePatternStrCols());
} else {
cp.setMissingValuePattern("?");
}
} else {
// for int or double, use the one we figured out (or none)
cp.setMissingValuePattern(missValPattern[c]);
}
colPropResult[c] = cp;
}
return colPropResult;
}
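Stripped of the KNIME cell factory, the user presets, and the per-column missing-value bookkeeping, the type guessing is a narrowing scheme: every column starts as Integer, falls back to Double when a token is not an integer, and falls back to String when it is not a double either. A rough sketch under those simplifications (GuessedType and guessColumnType are hypothetical names, not part of FileAnalyzer):

import java.util.List;

class ColumnTypeGuess {

    enum GuessedType { INT, DOUBLE, STRING }

    /** Narrows the type from INT over DOUBLE to STRING while scanning the tokens of one column. */
    static GuessedType guessColumnType(final List<String> tokens) {
        GuessedType type = GuessedType.INT;
        for (String raw : tokens) {
            String token = raw.trim();
            if (type == GuessedType.INT) {
                try {
                    Integer.parseInt(token);
                    continue;                      // still an integer column
                } catch (NumberFormatException nfe) {
                    type = GuessedType.DOUBLE;     // retry the same token as a double
                }
            }
            if (type == GuessedType.DOUBLE) {
                try {
                    Double.parseDouble(token);
                    continue;                      // still a double column
                } catch (NumberFormatException nfe) {
                    type = GuessedType.STRING;     // give up: everything fits a string
                }
            }
        }
        return type;
    }
}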
Use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.
The class FileAnalyzer, method testDelimiterSettingsSetColNum.
/*
* With the new "ignore empty tokens at end of row" option this got a bit
* more complicated: We need to keep a range of numberOfColumns that we can
* accept. The lower bound will be the number of non-empty columns we have read
* so far (because this is the minimum all rows must have); the maximum will
* be the non-empty plus the empty columns we have seen so far. The reason is
* that we may need some of these empty tokens at the end of a row to fill
* the row, in case a later row has more (non-empty) tokens.
*/
private static boolean testDelimiterSettingsSetColNum(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
long fileSize = reader.getFileSize();
int linesRead = 0;
// column counter per line
int columns = 0;
// num of cols with these settings
int numOfCols = -1;
// num of cols incl. some empty tokens at EOR
int maxNumOfCols = -1;
// set it true to use these settings.
boolean useSettings = false;
// consecutive empty tokens read
int consEmptyTokens = 0;
boolean lastTokenWasDelimited = false;
while (true) {
if ((settings.getMaximumNumberOfRowsToRead() > -1) && (linesRead >= settings.getMaximumNumberOfRowsToRead())) {
break;
}
String token = tokenizer.nextToken();
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / (double) fileSize);
}
if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
columns++;
lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
// keep track of the empty tokens read.
if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
consEmptyTokens++;
} else {
consEmptyTokens = 0;
}
} else {
if (columns > 0) {
// ignore empty lines
linesRead++;
try {
if (cutItShort(exec) && (linesRead > getShortCutLines(exec))) {
// cutItShort also checks for interrupts
settings.setAnalyzeUsedAllRows(false);
break;
}
} catch (InterruptedExecutionException iee) {
tokenizer.closeSourceStream();
throw iee;
}
if (token == null && lastTokenWasDelimited) {
// if the last line has no LF, EOF delimits the last column
columns++;
}
if (linesRead > 1) {
if (numOfCols < 1) {
// this is the first line we determine the number of columns for
if (settings.ignoreEmptyTokensAtEndOfRow()) {
// these are the "hard" columns we need
numOfCols = columns - consEmptyTokens;
// we could fill up to this number with empty
// tokens
maxNumOfCols = columns;
if (numOfCols > 1) {
// if we get more than one column, the
// settings look reasonable
useSettings = true;
}
} else {
numOfCols = columns;
if (numOfCols <= 1) {
// we don't need this delimiter if we put
// everything in one column
useSettings = false;
break;
}
useSettings = true;
}
} else {
if (settings.ignoreEmptyTokensAtEndOfRow()) {
if ((columns - consEmptyTokens) > maxNumOfCols) {
// we read more non-empty columns than we
// could
// fill (in other rows) with empty tokens
useSettings = false;
break;
}
if (columns < numOfCols) {
// even with empty tokens this line has not
// enough columns
useSettings = false;
break;
}
if (columns < maxNumOfCols) {
// "maxNumOfCols" is the maximum number all
// rows can deliver.
maxNumOfCols = columns;
}
if ((columns - consEmptyTokens) > numOfCols) {
// Adjust the number of "hard" columns
numOfCols = columns - consEmptyTokens;
if (numOfCols > 1) {
useSettings = true;
}
}
// the "hard" column count can never exceed the maximum all rows can deliver
assert numOfCols <= maxNumOfCols;
} else {
// not ignoring empty tokens: all rows must deliver the same number of cols
if (columns != numOfCols) {
// not good. Getting different number of
// columns in different lines.
useSettings = false;
break;
}
}
}
}
}
consEmptyTokens = 0;
columns = 0;
lastTokenWasDelimited = false;
if (token == null) {
// seen end of file.
break;
}
}
}
tokenizer.closeSourceStream();
if (useSettings) {
settings.setNumberOfColumns(numOfCols);
}
return useSettings;
}
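The range bookkeeping described in the comment above can be isolated from the tokenizer: keep a lower bound (the "hard", non-empty columns every row must provide) and an upper bound (hard columns plus trailing empty tokens), and reject the candidate delimiter as soon as a row falls outside that range. A self-contained sketch assuming the rows were already split with the candidate delimiter (consistentColumnCount is a hypothetical helper, not part of FileAnalyzer):

import java.util.List;

class DelimiterRangeCheck {

    /**
     * @param rows each row already split into tokens with the candidate delimiter
     * @return the agreed number of columns, or -1 if the rows are inconsistent
     */
    static int consistentColumnCount(final List<List<String>> rows) {
        int minCols = -1;   // "hard" columns every row must provide
        int maxCols = -1;   // hard columns plus trailing empty tokens
        for (List<String> row : rows) {
            int cols = row.size();
            int trailingEmpty = 0;
            for (int i = cols - 1; i >= 0 && row.get(i).isEmpty(); i--) {
                trailingEmpty++;
            }
            int hard = cols - trailingEmpty;
            if (minCols < 0) {
                // the first row initializes the acceptable range
                minCols = hard;
                maxCols = cols;
                continue;
            }
            if (hard > maxCols || cols < minCols) {
                // this row cannot be padded or trimmed to fit the other rows
                return -1;
            }
            minCols = Math.max(minCols, hard);
            maxCols = Math.min(maxCols, cols);
        }
        return minCols;
    }
}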
Use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.
The class FileAnalyzer, method getMaximumNumberOfColumns.
private static int getMaximumNumberOfColumns(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
double fileSize = reader.getFileSize();
// non-empty lines
int dataLinesRead = 0;
// the counter per line
int colCount = 0;
// the maximum
int numOfCols = 0;
// consecutive empty tokens
int consEmptyTokens = 0;
// remember it, in case the last token in the file has no delimiter
boolean lastTokenWasDelimited = false;
try {
while (true) {
String token = tokenizer.nextToken();
if (!settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
colCount++;
lastTokenWasDelimited = tokenizer.lastTokenWasDelimited();
// keep track of the empty tokens read.
if (token.equals("") && !tokenizer.lastTokenWasQuoted()) {
consEmptyTokens++;
} else {
consEmptyTokens = 0;
}
} else {
// null token (=EOF) is a row delimiter
if (colCount > 0) {
// ignore empty lines
dataLinesRead++;
}
if (token == null && colCount < numOfCols && lastTokenWasDelimited) {
// if the last line has no LF, EOF delimits the last column
colCount++;
}
if (settings.ignoreEmptyTokensAtEndOfRow()) {
// we are looking for the maximum - those empty tokens
// should not contribute to it.
colCount -= consEmptyTokens;
}
if (colCount > numOfCols) {
// we are supposed to return the maximum
numOfCols = colCount;
settings.setColumnNumDeterminingLineNumber(tokenizer.getLineNumber());
}
colCount = 0;
consEmptyTokens = 0;
if (token == null) {
break;
}
if (settings.getMaximumNumberOfRowsToRead() > -1) {
if (tokenizer.getLineNumber() > settings.getSkipFirstLines() + settings.getMaximumNumberOfRowsToRead()) {
break;
}
}
if (cutItShort(exec)) {
// cutItShort also checks for interrupts
if (dataLinesRead >= getShortCutLines(exec)) {
settings.setAnalyzeUsedAllRows(false);
break;
}
exec.setProgress(dataLinesRead / (double) getShortCutLines(exec));
} else {
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
}
}
}
}
} finally {
tokenizer.closeSourceStream();
}
return numOfCols;
}
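The counting rule itself is small once the tokenizing is taken out of the picture: take the maximum over all lines, and when "ignore empty tokens at end of row" is active, trailing empty tokens of a line do not contribute to it. A minimal sketch (maximumColumns is a hypothetical helper):

import java.util.List;

class MaxColumnCount {

    /** Returns the maximum number of columns over all rows. */
    static int maximumColumns(final List<List<String>> rows, final boolean ignoreEmptyAtEnd) {
        int max = 0;
        for (List<String> row : rows) {
            int cols = row.size();
            if (ignoreEmptyAtEnd) {
                // trailing empty tokens must not contribute to the maximum
                while (cols > 0 && row.get(cols - 1).isEmpty()) {
                    cols--;
                }
            }
            max = Math.max(max, cols);
        }
        return max;
    }
}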
Use of org.knime.base.node.util.BufferedFileReader in project knime-core by knime.
The class FileAnalyzer, method checkRowHeader.
/**
* Looks at the first token of each line (except the first line) and returns true if they are all prefixed by the
* same (possibly empty) string followed by a constantly incremented number.
*
* @param settings the file to look at with corresponding settings
* @param exec to check for cancellations and to report progress
* @return true if it's reasonable to assume the file has row headers
* @throws IOException if an I/O error occurs
* @throws InterruptedExecutionException if analysis should be interrupted immediately
*/
private static boolean checkRowHeader(final FileReaderNodeSettings settings, final ExecutionMonitor exec) throws IOException, InterruptedExecutionException {
BufferedFileReader reader = settings.createNewInputReader();
final double fileSize = reader.getFileSize();
long linesRead = 0;
exec.setProgress("Guessing row IDs");
Tokenizer tokenizer = new Tokenizer(reader);
tokenizer.setSettings(settings);
String token;
HeaderHelper helper = null;
boolean firstTokenInRow = true;
try {
while (true) {
token = tokenizer.nextToken();
if (token == null) {
// end of file
break;
}
if (firstTokenInRow && settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
// ignore empty rows
continue;
}
if (firstTokenInRow) {
firstTokenInRow = false;
if (linesRead > 0) {
// we ignore the first line (could be col header line)
if (helper == null) {
// the first row ID we see
helper = HeaderHelper.extractPrefixAndIndexFromHeader(token);
if (helper == null) {
// that's not row header material
return false;
}
} else {
// all other headers must match the first one
if (!helper.testNextHeader(token)) {
return false;
}
}
}
} else {
// swallow all tokens except new line delimiters
if (settings.isRowDelimiter(token, tokenizer.lastTokenWasQuoted())) {
// the next token is the first
firstTokenInRow = true;
linesRead++;
if (cutItShort(exec)) {
if (linesRead > getShortCutLines(exec)) {
break;
}
exec.setProgress(linesRead / (double) getShortCutLines(exec));
} else {
if (fileSize > 0) {
exec.setProgress(reader.getNumberOfBytesRead() / fileSize);
}
}
}
}
}
} finally {
tokenizer.closeSourceStream();
}
return true;
}
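A rough, self-contained approximation of the row-ID test described in the javadoc (HeaderHelper itself is not shown in this snippet): each first token must consist of a common prefix followed by a number, and the numbers must keep growing. The real helper may be stricter, e.g. require a constant increment; RowHeaderCheck and looksLikeRowHeaders are hypothetical names:

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class RowHeaderCheck {

    private static final Pattern PREFIX_AND_INDEX = Pattern.compile("(.*?)(\\d+)");

    /** @param firstTokens the first token of every data row (the column header line excluded) */
    static boolean looksLikeRowHeaders(final List<String> firstTokens) {
        String prefix = null;
        long lastIndex = Long.MIN_VALUE;
        for (String token : firstTokens) {
            Matcher m = PREFIX_AND_INDEX.matcher(token);
            if (!m.matches()) {
                return false;              // no trailing number - not row header material
            }
            long index;
            try {
                index = Long.parseLong(m.group(2));
            } catch (NumberFormatException nfe) {
                return false;              // number too long to parse - give up
            }
            if (prefix == null) {
                prefix = m.group(1);       // the first row ID fixes the prefix
            } else if (!prefix.equals(m.group(1)) || index <= lastIndex) {
                return false;              // prefix changed or index not increasing
            }
            lastIndex = index;
        }
        return true;
    }
}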