Use of org.supercsv.exception.SuperCsvException in the voltdb project by VoltDB.
Class Tokenizer, method readColumns.
/**
 * {@inheritDoc}
 * <p>
 * Reads the next data line (skipping blank lines, comment lines and lines still covered by the
 * skip count), splits it into columns and stores them in {@code columns}. A quoted column may
 * span several physical lines; in that case further lines are read until the closing quote is
 * found. Empty columns are stored as {@code null}.
 *
 * @param columns the reusable list that receives the columns of the row; cleared on entry
 * @return {@code true} if a row was read, {@code false} if the end of the input was reached
 * @throws IOException if reading a line from the underlying source fails
 * @throws NullPointerException if {@code columns} is null
 * @throws SuperCsvException if strict quoting is violated, a quoted column exceeds
 *         {@code columnSizeLimit}, the input ends inside a quoted column, or an escape
 *         character is the last character of a line
 */
@Override
public boolean readColumns(final List<String> columns) throws IOException {
    if (columns == null) {
        throw new NullPointerException("columns should not be null");
    }
    // clear the reusable List and StringBuilders
    columns.clear();
    currentColumn.setLength(0);
    currentRow.setLength(0);

    // keep reading lines until data is found
    String line;
    do {
        line = readLine();
        if (line == null) {
            // EOF
            return false;
        }
    } while (line.trim().isEmpty()
            || (commentMatcher != null && commentMatcher.isComment(line))
            || (!header && skip >= getLineNumber()));

    // the first accepted line while 'header' is set is let through; the skip count is bumped
    // so that the skip test above keeps lining up with getLineNumber() afterwards
    if (header) {
        header = false;
        skip++;
    }

    // update the untokenized CSV row
    currentRow.append(line);
    // add a newline to determine end of line (making parsing easier)
    line += NEWLINE;

    // process each character in the line, catering for surrounding quotes (QUOTE_MODE)
    TokenizerState state = TokenizerState.NORMAL;
    // the line number where a potential multi-line cell starts
    int quoteScopeStartingLine = -1;
    // keep track of spaces (so leading/trailing space can be removed if required)
    int potentialSpaces = 0;
    int charIndex = 0;
    // in strict-quotes mode every column must open with a quote before any data character
    boolean expectQuote = this.strictQuotes;
    boolean isEscape = false;
    boolean sawNewLineInQuote = false;

    while (true) {
        final char c = line.charAt(charIndex);

        if (TokenizerState.NORMAL.equals(state)) {
            /*
             * NORMAL mode (not within quotes).
             */
            if (isEscape) {
                /*
                 * Escaped character: treated as plain data regardless of its usual meaning.
                 */
                isEscape = false;
                if (this.strictQuotes && expectQuote) {
                    throw new SuperCsvException(String.format("strictQuotes: quotes needed at line %d column %d. To proceed, " + "either quote the column or remove --strictquotes", getLineNumber(), columns.size() + 1));
                }
                if (c == NEWLINE) {
                    /*
                     * The escape character was the last character of the line. Consuming the
                     * sentinel newline as data would run the cursor past the end of the line
                     * (StringIndexOutOfBoundsException), so report malformed input instead.
                     */
                    throw new SuperCsvException(String.format("unexpected end of line after escape character at line %d column %d", getLineNumber(), columns.size() + 1));
                }
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
                currentColumn.append(c);
            } else if (c == escapeChar && line.charAt(charIndex + 1) != 'N') {
                /*
                 * Escape character. The look-ahead is safe because the line always ends with the
                 * sentinel newline. An escape followed by 'N' is deliberately not treated as an
                 * escape and is kept verbatim (presumably a null marker -- confirm with callers).
                 */
                isEscape = true;
            } else if (c == delimeterChar) {
                /*
                 * Delimiter. Save the column (trim trailing space if required) then continue to next character.
                 */
                expectQuote = true;
                if (!surroundingSpacesNeedQuotes) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                // "" -> null
                columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
                potentialSpaces = 0;
                currentColumn.setLength(0);
            } else if (c == SPACE) {
                /*
                 * Space. Remember it, then continue to next character.
                 */
                potentialSpaces++;
            } else if (c == NEWLINE) {
                /*
                 * Newline. Add any required spaces (if surrounding spaces don't need quotes) and return
                 * (we've read a line!).
                 */
                if (!surroundingSpacesNeedQuotes) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                // "" -> null
                columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
                return true;
            } else if (c == quoteChar) {
                /*
                 * A single quote ("). Update to QUOTE_MODE (but don't save quote), then continue to next
                 * character.
                 */
                expectQuote = false;
                state = TokenizerState.QUOTE_MODE;
                quoteScopeStartingLine = getLineNumber();
                // cater for spaces before a quoted section (be lenient!)
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
            } else {
                /*
                 * Just a normal character. Add any required spaces (but trim any leading spaces if
                 * surrounding spaces need quotes), add the character, then continue to next character.
                 */
                if (this.strictQuotes && expectQuote) {
                    throw new SuperCsvException(String.format("strictQuotes: quotes needed at line %d column %d. To proceed, " + "either quote the column or remove --strictquotes", getLineNumber(), columns.size() + 1));
                }
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
                currentColumn.append(c);
            }
        } else {
            /*
             * QUOTE_MODE (within quotes).
             */
            // once the quoted column spans lines, enforce the size limit on every character
            if (sawNewLineInQuote && currentColumn.length() > columnSizeLimit) {
                throw oversizedColumn(columns.size() + 1, quoteScopeStartingLine);
            }

            if (c == NEWLINE) {
                /*
                 * Newline. Doesn't count as end of row while in QUOTE_MODE. Add the newline char, reset
                 * the charIndex (will update to 0 for next iteration), read in the next line, then
                 * continue to next character. An unterminated quoted section would otherwise keep
                 * consuming lines, so the column size is bounded by columnSizeLimit.
                 */
                if (currentColumn.length() > columnSizeLimit) {
                    throw oversizedColumn(columns.size() + 1, quoteScopeStartingLine);
                }
                sawNewLineInQuote = true;
                currentColumn.append(NEWLINE);
                // specific line terminator lost, \n will have to suffice
                currentRow.append(NEWLINE);
                charIndex = -1;
                line = readLine();
                if (line == null) {
                    throw new SuperCsvException(String.format("unexpected end of file while reading quoted column %d beginning on line %d and ending on line %d", columns.size() + 1, quoteScopeStartingLine, getLineNumber()));
                }
                // update untokenized CSV row
                currentRow.append(line);
                // add newline to simplify parsing
                line += NEWLINE;
            } else if (c == quoteChar) {
                // look-ahead is safe: the line always ends with the sentinel newline
                if (line.charAt(charIndex + 1) == quoteChar) {
                    /*
                     * An escaped quote (""). Add a single quote, then move the cursor so the next
                     * iteration of the loop will read the character following the escaped quote.
                     */
                    currentColumn.append(c);
                    charIndex++;
                } else {
                    /*
                     * A single quote ("). Check the column size limit first, while the starting line
                     * is still known (so the error message reports the real line instead of -1), then
                     * return to NORMAL mode (but don't save the quote).
                     */
                    if (currentColumn.length() > columnSizeLimit) {
                        throw oversizedColumn(columns.size() + 1, quoteScopeStartingLine);
                    }
                    state = TokenizerState.NORMAL;
                    sawNewLineInQuote = false;
                    // reset ready for next multi-line cell
                    quoteScopeStartingLine = -1;
                }
            } else {
                /*
                 * Just a normal character, delimiter (they don't count in QUOTE_MODE) or space. Add the
                 * character, then continue to next character.
                 */
                currentColumn.append(c);
            }
        }

        // read next char of the line
        charIndex++;
    }
}

/**
 * Builds the exception raised when a quoted column grows beyond {@code columnSizeLimit}.
 *
 * @param columnNumber the 1-based number of the offending column
 * @param startingLine the line on which the quoted column started
 * @return the exception to throw; the ending line is taken from {@code getLineNumber()}
 */
private SuperCsvException oversizedColumn(final int columnNumber, final int startingLine) {
    return new SuperCsvException(String.format("oversized column while reading quoted column %d beginning on line %d and ending on line %d. " + "See --columnsizelimit.", columnNumber, startingLine, getLineNumber()));
}
Use of org.supercsv.exception.SuperCsvException in the voltdb project by VoltDB.
Class VoltSuperCSVFormatter, method transform.
/**
 * Decodes the payload as UTF-8 text, feeds it to the tokenizer and reads one CSV row from it.
 * <p>
 * Only the bytes between the buffer's position and limit are decoded, and the buffer's own
 * position is left untouched. Buffers without an accessible backing array (direct or read-only)
 * are supported by copying their remaining bytes.
 *
 * @param payload the raw CSV data; may be {@code null}
 * @return the column values of the row after {@code normalize} has been applied to them, or
 *         {@code null} if {@code payload} is {@code null} or the reader returned no row
 * @throws FormatException if reading or tokenizing the data fails; the original
 *         {@link IOException} or {@code SuperCsvException} is kept as the cause
 */
@Override
public Object[] transform(ByteBuffer payload) throws FormatException {
    if (payload == null) {
        return null;
    }

    final String line;
    if (payload.hasArray()) {
        // decode [position, limit) in place; arrayOffset() accounts for sliced buffers
        line = new String(payload.array(), payload.arrayOffset() + payload.position(), payload.remaining(), StandardCharsets.UTF_8);
    } else {
        // no accessible backing array: copy through a duplicate so the caller's position is kept
        final byte[] bytes = new byte[payload.remaining()];
        payload.duplicate().get(bytes);
        line = new String(bytes, StandardCharsets.UTF_8);
    }

    m_tokenizer.setSourceString(line);
    final List<String> dataList;
    try {
        dataList = m_csvReader.read();
    } catch (IOException | SuperCsvException e) {
        throw new FormatException("Failed to parse csv data", e);
    }
    if (dataList == null) {
        return null;
    }

    final String[] data = dataList.toArray(new String[0]);
    normalize(data);
    return data;
}
Aggregations