use of org.knime.core.util.tokenizer.Quote in project knime-core by knime.
the class FileAnalyzerTest method testQuote.
/**
 * Makes sure double quotes and single quotes are only supported when they
 * appear in even numbers.
 *
 * Four files are analyzed: one with well-formed double and single quotes,
 * one where the tick (') is part of the data, one that additionally contains
 * an unbalanced double quote, and one with an escaped double quote inside a
 * quoted token.
 */
public void testQuote() {
    try {
        /*
         * nice quoting
         */
        URL url = initTempFile("\"col1\",'col2',col3,col4\n" + "\"foo\",poo,\"moo\",zoo\n" + "oof,'oo p',oom,' ooz '");
        FileReaderNodeSettings settings = new FileReaderNodeSettings();
        settings.setDataFileLocationAndUpdateTableName(url);
        FileReaderNodeSettings analyzed = FileAnalyzer.analyze(settings, null);
        assertTrue(analyzed.getFileHasColumnHeaders());
        assertEquals(4, analyzed.getNumberOfColumns());
        Vector<Quote> quotes = analyzed.getAllQuotes();
        // we support '"' and '
        assertEquals(2, quotes.size());
        assertBackslashEscapedQuote("\"", quotes.get(0));
        assertBackslashEscapedQuote("'", quotes.get(1));

        /*
         * the tick (') is part of the data - don't consider it a quote (it
         * must show up an odd number of times...)
         */
        url = initTempFile("\"col1\",col2,col3,col4\n" + "\"foo\",poo,\"moo\",zoo\n" + "oo'f,o'op,o'om,ooz");
        settings = new FileReaderNodeSettings();
        settings.setDataFileLocationAndUpdateTableName(url);
        analyzed = FileAnalyzer.analyze(settings, null);
        assertTrue(analyzed.getFileHasColumnHeaders());
        assertEquals(4, analyzed.getNumberOfColumns());
        quotes = analyzed.getAllQuotes();
        // we support '"' still
        assertEquals(1, quotes.size());
        assertBackslashEscapedQuote("\"", quotes.get(0));

        /*
         * there is also a single double quote in the data
         */
        url = initTempFile("\"col1,col2,col3,col4\n" + "fo\"o,poo,moo,zoo\n" + "oo'f,o'op,o'om,ooz");
        settings = new FileReaderNodeSettings();
        settings.setDataFileLocationAndUpdateTableName(url);
        analyzed = FileAnalyzer.analyze(settings, null);
        assertEquals(4, analyzed.getNumberOfColumns());
        assertEquals(0, analyzed.getAllQuotes().size());

        /*
         * don't stumble over escaped quotes
         */
        url = initTempFile("col1,col2,col3,col4\n" + "\"foo\",\"po\\\"o\",moo,zoo\n" + "oo'f,o'op,o'om,ooz");
        settings = new FileReaderNodeSettings();
        settings.setDataFileLocationAndUpdateTableName(url);
        analyzed = FileAnalyzer.analyze(settings, null);
        assertEquals(4, analyzed.getNumberOfColumns());
        // we must support the double quotes with the escape char
        quotes = analyzed.getAllQuotes();
        assertEquals(1, quotes.size());
        assertBackslashEscapedQuote("\"", quotes.get(0));
    } catch (IOException ioe) {
        // goes off if the temp file couldn't be created or if the analyzer
        // failed to read it; keep the exception text in the failure message
        assertTrue("I/O error while creating or analyzing the temp file: " + ioe, false);
    }
}

/**
 * Asserts that the quote uses the given pattern as both its left and right
 * delimiter and that it has the backslash as escape character.
 *
 * @param pattern the expected left and right quote pattern
 * @param quote the quote to check
 */
private void assertBackslashEscapedQuote(final String pattern, final Quote quote) {
    // expected value first, so failure messages read the right way round
    assertEquals(pattern, quote.getLeft());
    assertEquals(pattern, quote.getRight());
    assertTrue(quote.hasEscapeChar());
    assertEquals('\\', quote.getEscape());
}
use of org.knime.core.util.tokenizer.Quote in project knime-core by knime.
the class QuotePanel method overrideSettings.
/**
 * Replaces the quotes in the passed settings object with the quotes that are
 * currently listed in the panel's JList, marks the quotes as user set and
 * takes over the state of the "allow LF" checkbox.
 *
 * @param settings the settings object whose quotes are removed and replaced
 *            by the quotes currently defined in the panel
 * @return true if the number of quotes changed or if at least one of the new
 *         quotes is not contained in the previous ones. The LF flag does not
 *         influence the result.
 */
boolean overrideSettings(final FileReaderNodeSettings settings) {
    // remember the previous quotes; they are compared with the new ones below.
    // NOTE(review): assumes getAllQuotes() hands out a list that is not
    // emptied by removeAllQuotes() - confirm.
    final Vector<Quote> previous = settings.getAllQuotes();
    settings.removeAllQuotes();

    final int entryCount = m_currQuotes.getModel().getSize();
    for (int idx = 0; idx < entryCount; idx++) {
        final String entry = (String) m_currQuotes.getModel().getElementAt(idx);
        final String pattern = getQuotePattern(entry);
        final int escape = getEscCharacter(entry);
        // the same pattern serves as left and right quote; -1 means that
        // the entry defines no escape character
        if (escape == -1) {
            settings.addQuotePattern(pattern, pattern);
        } else {
            settings.addQuotePattern(pattern, pattern, (char) escape);
        }
    }

    // the quotes now come from the user
    settings.setQuoteUserSet(true);
    // LF support
    settings.allowLFinQuotes(m_allowLFCheckbox.isSelected());

    // the file needs to be re-analyzed only if the quotes are different
    final Vector<Quote> current = settings.getAllQuotes();
    return current.size() != previous.size() || !previous.containsAll(current);
}
use of org.knime.core.util.tokenizer.Quote in project knime-core by knime.
the class FileAnalyzer method analyze.
/**
* Tries to guess FileReader settings for the passed data file. It will use the settings in the settings object (if
* any - but the file location is required), and will read in the first lines from the file. It will first detect
* comment characters (if the first lines start with '#' or '%'), and then guess the delimiter (',', ';', or space)
* depending on which cuts a line into (more than one) tokens.
*
* The steps are executed in this order: simple values are copied from the user settings, then charset, comments,
* quotes, whitespaces, delimiters/column count, row headers, column properties, row header prefix and the
* ignore-empty-lines flag are either guessed or taken over from the user settings.
*
* @param userSettings containing the URL of the file to examine and settings that should be used and considered
* fixed.
* @param exec used to check for cancellations and to report progress. Could be null. If a
* {@link FileReaderExecutionMonitor} is provided it is distinguished between user cancellations cutting
* the analysis short, and interrupts that return immediately and return null as result.
* @return settings that supposedly provide more or less useful results, or <code>null</code> if the analysis was
* interrupted (i.e. an {@link InterruptedExecutionException} was caught). A returned settings object may
* not contain any useful settings if guessing was just too hard.
* @throws IOException if there was an error reading from the URL
* @throws IllegalArgumentException if the user settings contain no data file location
*/
public static FileReaderNodeSettings analyze(final FileReaderNodeSettings userSettings, final ExecutionMonitor exec) throws IOException {
if (userSettings.getDataFileLocation() == null) {
throw new IllegalArgumentException("Must specify a valid file location for the file analyzer");
}
ExecutionMonitor execMon = exec;
if (execMon == null) {
// we create a default exec monitor. Doesn't hurt, because that
// will never be canceled.
execMon = new FileReaderExecutionMonitor();
}
// create the new and empty settings
FileReaderNodeSettings result = new FileReaderNodeSettings();
execMon.setProgress(0.0);
try {
// values that are never guessed are copied from the user settings first
result.setDataFileLocationAndUpdateTableName(userSettings.getDataFileLocation());
result.setDecimalSeparator(userSettings.getDecimalSeparator());
result.setThousandsSeparator(userSettings.getThousandsSeparator());
result.setDecimalSeparatorUserSet(userSettings.decimalSeparatorUserSet());
result.setUniquifyRowIDs(userSettings.uniquifyRowIDs());
result.setMaximumNumberOfRowsToRead(userSettings.getMaximumNumberOfRowsToRead());
result.setSkipFirstLines(userSettings.getSkipFirstLines());
result.allowLFinQuotes(userSettings.allowLFinQuotes());
// NOTE(review): the charset name is set again in both branches of the if/else below, which makes this
// assignment redundant
result.setCharsetName(userSettings.getCharsetName());
// start with "all rows used"; presumably the helper steps below clear the flag when they stop reading
// early - TODO confirm
result.setAnalyzeUsedAllRows(true);
result.setMissValuePatternStrCols(userSettings.getMissValuePatternStrCols());
result.setConnectTimeout(userSettings.getConnectTimeout());
// if the user didn't provide the charset, identify it by looking at the first bytes of the stream
if (!userSettings.isCharsetUserSet()) {
result.setCharsetName(guessCharSet(userSettings));
result.setCharsetUserSet(false);
} else {
result.setCharsetName(userSettings.getCharsetName());
result.setCharsetUserSet(true);
}
// --- comments ---
ExecutionMonitor subExec = execMon.createSubProgress(COMMENT_SUB);
if (!userSettings.isCommentUserSet()) {
// only guess comment patterns if user didn't provide any
addComments(result, subExec);
result.setCommentUserSet(false);
} else {
// take over user settings.
for (Comment comment : userSettings.getAllComments()) {
result.addBlockCommentPattern(comment.getBegin(), comment.getEnd(), comment.returnAsSeparateToken(), comment.includeInToken());
}
result.setCommentUserSet(true);
}
subExec.setProgress(1.0);
// presumably throws the InterruptedExecutionException caught at the end of this method - TODO confirm
checkInterrupt(execMon);
// --- quotes ---
subExec = execMon.createSubProgress(QUOTES_SUB);
if (!userSettings.isQuoteUserSet()) {
// only guess quotes if user didn't specify any
addQuotes(result, subExec);
result.setQuoteUserSet(false);
} else {
// take over user settings.
for (Quote quote : userSettings.getAllQuotes()) {
// the escape character is only handed over if the quote defines one
if (quote.hasEscapeChar()) {
result.addQuotePattern(quote.getLeft(), quote.getRight(), quote.getEscape());
} else {
result.addQuotePattern(quote.getLeft(), quote.getRight());
}
}
result.setQuoteUserSet(true);
}
subExec.setProgress(1.0);
checkInterrupt(execMon);
// --- whitespaces ---
// if user provided whitespace characters, we need to add them.
if (userSettings.isWhiteSpaceUserSet()) {
for (String ws : userSettings.getAllWhiteSpaces()) {
result.addWhiteSpaceCharacter(ws);
}
result.setWhiteSpaceUserSet(true);
} else {
// default whitespaces: space and tab
result.addWhiteSpaceCharacter(" ");
result.addWhiteSpaceCharacter("\t");
result.setWhiteSpaceUserSet(false);
}
// NOTE(review): no sub-progress was created for the whitespace step; subExec still refers to the quotes
// sub-progress that was already set to 1.0 above
subExec.setProgress(1.0);
checkInterrupt(execMon);
// for now we just take over this flag:
result.setSupportShortLines(userSettings.getSupportShortLines());
// --- delimiters and number of columns ---
// sets delimiter and column numbers (as many columns as it gets
// with the delimiters - regardless of any row headers);
// honors user settings
subExec = execMon.createSubProgress(DELIMS_SUB);
setDelimitersAndColNum(userSettings, result, subExec);
// only checked if assertions are enabled in the JVM
assert result.getNumberOfColumns() > 0;
subExec.setProgress(1.0);
checkInterrupt(execMon);
// --- row headers ---
// the number of column set as of now does not take into account the
// skipped columns.
subExec = execMon.createSubProgress(ROWHDR_SUB);
if (userSettings.isFileHasRowHeadersUserSet()) {
result.setFileHasRowHeaders(userSettings.getFileHasRowHeaders());
result.setFileHasRowHeadersUserSet(true);
} else {
boolean hasRowHeaders;
if (result.getNumberOfColumns() > 1) {
// if we have at least 2 cols, one of them could be headers
hasRowHeaders = checkRowHeader(result, subExec);
} else {
hasRowHeaders = false;
}
result.setFileHasRowHeaders(hasRowHeaders);
result.setFileHasRowHeadersUserSet(false);
}
subExec.setProgress(1.0);
checkInterrupt(execMon);
// we must correct the column number we've guessed: the row header column is not counted as a column
if (result.getFileHasRowHeaders()) {
result.setNumberOfColumns(result.getNumberOfColumns() - 1);
}
// --- column properties ---
// guesses (or copies) column types and names; one sub-progress covers both the types and the column headers.
subExec = execMon.createSubProgress(TYPES_SUB + COLHDR_SUB);
Vector<ColProperty> columnProps = createColumnProperties(userSettings, result, subExec);
result.setColumnProperties(columnProps);
subExec.setProgress(1.0);
// set a default row header prefix ("Row") if the user settings contain none
if (userSettings.getRowHeaderPrefix() != null) {
result.setRowHeaderPrefix(userSettings.getRowHeaderPrefix());
} else {
result.setRowHeaderPrefix("Row");
}
// empty lines are ignored unless the user settings say otherwise
if (userSettings.isIgnoreEmptyLinesUserSet()) {
result.setIgnoreEmptyLines(userSettings.getIgnoreEmtpyLines());
result.setIgnoreEmptyLinesUserSet(true);
} else {
result.setIgnoreEmptyLines(true);
result.setIgnoreEmptyLinesUserSet(false);
}
execMon.setProgress(1.0);
} catch (InterruptedExecutionException iee) {
// the analysis was interrupted: no (partial) result is returned
return null;
}
return result;
}
use of org.knime.core.util.tokenizer.Quote in project knime-core by knime.
the class QuotePanel method loadSettings.
/**
 * Displays the passed settings in the panel: the quote list shows one entry
 * per quote of the settings object, the edit field and the error text are
 * cleared, and the "allow LF" checkbox reflects the corresponding setting.
 *
 * @param settings the object containing the settings to display
 */
private void loadSettings(final FileReaderNodeSettings settings) {
    // one list entry per quote defined in the settings object
    final Vector<String> entries = new Vector<String>();
    for (Quote quote : settings.getAllQuotes()) {
        entries.add(getListEntry(quote));
    }
    // installs a read-only list model that is backed by the vector
    m_currQuotes.setListData(entries);

    // also clear the edit field
    getQEditField().setText("");
    clearErrorText();
    m_allowLFCheckbox.setSelected(settings.allowLFinQuotes());
}
Aggregations