Search in sources :

Example 1 with SuperCsvException

use of org.supercsv.exception.SuperCsvException in project voltdb by VoltDB.

the class Tokenizer method readColumns.

/**
 * {@inheritDoc}
 * <p>
 * Skips lines that are blank, that {@code commentMatcher} reports as comments, or (once the
 * header has been consumed) whose line number is not greater than {@code skip}; then tokenizes
 * the first remaining line into {@code columns}. A quoted column may span several physical
 * lines; the line breaks inside it are stored as {@code NEWLINE}.
 *
 * @param columns reusable list that is cleared and then filled with the column values of the
 *                row; a column with no content is added as {@code null}
 * @return {@code true} if a row was read, {@code false} if the input was exhausted before any
 *         data line was found
 * @throws IOException declared for failures while reading lines from the underlying source
 * @throws NullPointerException if {@code columns} is {@code null}
 * @throws SuperCsvException if strict quoting is enabled and an unquoted value is found, if a
 *         quoted column grows beyond {@code columnSizeLimit}, if the input ends inside a quoted
 *         column, or if a line ends directly after an escape character
 */
@Override
public boolean readColumns(final List<String> columns) throws IOException {
    if (columns == null) {
        throw new NullPointerException("columns should not be null");
    }
    // clear the reusable List and StringBuilders
    columns.clear();
    currentColumn.setLength(0);
    currentRow.setLength(0);
    // keep reading lines until data is found
    String line;
    do {
        line = readLine();
        if (line == null) {
            // EOF
            return false;
        }
    } while (line.trim().isEmpty()
            || (commentMatcher != null && commentMatcher.isComment(line))
            || (!header && skip >= getLineNumber()));
    if (header) {
        // the header line has now been consumed; widen the skip window by one line
        header = false;
        skip++;
    }
    // update the untokenized CSV row
    currentRow.append(line);
    // add a newline to determine end of line (making parsing easier)
    line += NEWLINE;
    // process each character in the line, catering for surrounding quotes (QUOTE_MODE)
    TokenizerState state = TokenizerState.NORMAL;
    // the line number where a potential multi-line cell starts
    int quoteScopeStartingLine = -1;
    // keep track of spaces (so leading/trailing space can be removed if required)
    int potentialSpaces = 0;
    int charIndex = 0;
    // true while the current column has not yet been opened by a quote (only enforced with strictQuotes)
    boolean expectQuote = this.strictQuotes;
    // true when the previous character was the escape character (NORMAL mode only)
    boolean isEscape = false;
    // true once a quoted column has continued onto a following physical line
    boolean sawNewLineInQuote = false;
    while (true) {
        final char c = line.charAt(charIndex);
        if (TokenizerState.NORMAL.equals(state)) {
            /*
             * NORMAL mode (not within quotes).
             */
            if (isEscape) {
                /*
                 * Escaped character: treated as a normal character. Add any required spaces (but trim any
                 * leading spaces if surrounding spaces need quotes), add the character, then continue to
                 * next character.
                 */
                isEscape = false;
                if (this.strictQuotes && expectQuote) {
                    throw new SuperCsvException(String.format("strictQuotes: quotes needed at line %d column %d. To proceed, " + "either quote the column or remove --strictquotes", getLineNumber(), columns.size() + 1));
                }
                if (charIndex == line.length() - 1) {
                    // The escaped character is the end-of-line sentinel appended above. Consuming it
                    // would make the loop run past the end of the line, so report a parse error.
                    throw new SuperCsvException(String.format("unexpected end of line after escape character at line %d column %d", getLineNumber(), columns.size() + 1));
                }
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
                currentColumn.append(c);
            } else if (c == escapeChar && line.charAt(charIndex + 1) != 'N') {
                // Escape character (dropped from the output). An escape character directly followed
                // by 'N' is not treated as an escape and falls through to the branches below.
                isEscape = true;
            } else if (c == delimeterChar) {
                /*
                 * Delimiter. Save the column (trim trailing space if required) then continue to next character.
                 */
                expectQuote = true;
                if (!surroundingSpacesNeedQuotes) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                // "" -> null
                columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
                potentialSpaces = 0;
                currentColumn.setLength(0);
            } else if (c == SPACE) {
                /*
                 * Space. Remember it, then continue to next character.
                 */
                potentialSpaces++;
            } else if (c == NEWLINE) {
                /*
                 * Newline. Add any required spaces (if surrounding spaces don't need quotes) and return
                 * (we've read a line!).
                 */
                if (!surroundingSpacesNeedQuotes) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                // "" -> null
                columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
                return true;
            } else if (c == quoteChar) {
                /*
                 * A single quote ("). Update to QUOTE_MODE (but don't save quote), then continue to next
                 * character.
                 */
                expectQuote = false;
                state = TokenizerState.QUOTE_MODE;
                quoteScopeStartingLine = getLineNumber();
                // cater for spaces before a quoted section (be lenient!)
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
            } else {
                /*
                 * Just a normal character. Add any required spaces (but trim any leading spaces if
                 * surrounding spaces need quotes), add the character, then continue to next character.
                 */
                if (this.strictQuotes && expectQuote) {
                    throw new SuperCsvException(String.format("strictQuotes: quotes needed at line %d column %d. To proceed, " + "either quote the column or remove --strictquotes", getLineNumber(), columns.size() + 1));
                }
                if (!surroundingSpacesNeedQuotes || currentColumn.length() > 0) {
                    appendSpaces(currentColumn, potentialSpaces);
                }
                potentialSpaces = 0;
                currentColumn.append(c);
            }
        } else {
            /*
             * QUOTE_MODE (within quotes).
             */
            if (sawNewLineInQuote && currentColumn.length() > columnSizeLimit) {
                // once the column spans several lines, enforce the size limit on every character
                throw new SuperCsvException(String.format("oversized column while reading quoted column %d beginning on line %d and ending on line %d. " + "See --columnsizelimit.", columns.size() + 1, quoteScopeStartingLine, getLineNumber()));
            }
            if (c == NEWLINE) {
                /*
                 * Newline. Doesn't count as newline while in QUOTE_MODE. Add the newline char, reset the
                 * charIndex (will update to 0 for next iteration), read in the next line, then continue to
                 * next character. The column size limit bounds how much of an unterminated quoted section
                 * (no trailing quote) is buffered before giving up.
                 */
                if (currentColumn.length() > columnSizeLimit) {
                    throw new SuperCsvException(String.format("oversized column while reading quoted column %d beginning on line %d and ending on line %d. " + "See --columnsizelimit.", columns.size() + 1, quoteScopeStartingLine, getLineNumber()));
                }
                sawNewLineInQuote = true;
                currentColumn.append(NEWLINE);
                // specific line terminator lost, \n will have to suffice
                currentRow.append(NEWLINE);
                charIndex = -1;
                line = readLine();
                if (line == null) {
                    throw new SuperCsvException(String.format("unexpected end of file while reading quoted column %d beginning on line %d and ending on line %d", columns.size() + 1, quoteScopeStartingLine, getLineNumber()));
                }
                // update untokenized CSV row
                currentRow.append(line);
                // add newline to simplify parsing
                line += NEWLINE;
            } else if (c == quoteChar) {
                if (line.charAt(charIndex + 1) == quoteChar) {
                    /*
                     * An escaped quote (""). Add a single quote, then move the cursor so the next iteration
                     * of the loop will read the character following the escaped quote.
                     */
                    currentColumn.append(c);
                    charIndex++;
                } else {
                    /*
                     * A single quote ("). Check the column size limit while the starting line is still
                     * known (so the error message reports it), then update to NORMAL (but don't save
                     * quote) and continue to next character.
                     */
                    if (currentColumn.length() > columnSizeLimit) {
                        throw new SuperCsvException(String.format("oversized column while reading quoted column %d beginning on line %d and ending on line %d. " + "See --columnsizelimit.", columns.size() + 1, quoteScopeStartingLine, getLineNumber()));
                    }
                    state = TokenizerState.NORMAL;
                    sawNewLineInQuote = false;
                    // reset ready for next multi-line cell
                    quoteScopeStartingLine = -1;
                }
            } else {
                /*
                 * Just a normal character, delimiter (they don't count in QUOTE_MODE) or space. Add the
                 * character, then continue to next character.
                 */
                currentColumn.append(c);
            }
        }
        // read next char of the line
        charIndex++;
    }
}
Also used : SuperCsvException(org.supercsv.exception.SuperCsvException)

Example 2 with SuperCsvException

use of org.supercsv.exception.SuperCsvException in project voltdb by VoltDB.

the class VoltSuperCSVFormatter method transform.

/**
 * Decodes {@code payload} as UTF-8 text, feeds it to the tokenizer and returns the columns read
 * by the CSV reader after passing them through {@code normalize}.
 *
 * @param payload buffer holding the CSV text; must be backed by an accessible array
 * @return the column values as a {@code String[]}, or {@code null} when {@code payload} is
 *         {@code null} or the reader returns no row
 * @throws FormatException if reading fails with an {@link IOException} or a
 *         {@link SuperCsvException}; the original exception is kept as the cause
 */
@Override
public Object[] transform(ByteBuffer payload) throws FormatException {
    if (payload == null) {
        return null;
    }

    // NOTE(review): decoding starts at arrayOffset() and spans limit() bytes; position() is not
    // consulted — confirm callers always pass buffers positioned at 0.
    final byte[] backing = payload.array();
    final int start = payload.arrayOffset();
    final int length = payload.limit();
    final String csvText = new String(backing, start, length, StandardCharsets.UTF_8);
    m_tokenizer.setSourceString(csvText);

    List<String> fields;
    try {
        fields = m_csvReader.read();
    } catch (IOException | SuperCsvException e) {
        throw new FormatException("Failed to parse csv data", e);
    }
    if (fields == null) {
        return null;
    }

    final String[] values = fields.toArray(new String[0]);
    normalize(values);
    return values;
}
Also used : IOException(java.io.IOException) FormatException(org.voltdb.importer.formatter.FormatException) SuperCsvException(org.supercsv.exception.SuperCsvException)

Aggregations

SuperCsvException (org.supercsv.exception.SuperCsvException): 2, IOException (java.io.IOException): 1, FormatException (org.voltdb.importer.formatter.FormatException): 1