Search in sources :

Example 1 with Position

Use of io.debezium.text.Position in project debezium by debezium.

The class DdlTokenizer, method tokenize.

/**
 * Splits the DDL character stream into tokens and reports each one to {@code tokens}.
 * <p>
 * Handling per leading character:
 * <ul>
 * <li>space, tab, {@code \n} and {@code \r} are skipped;</li>
 * <li>{@code #}, {@code --} and {@code //} start an end-of-line comment, and {@code /*} starts a block comment;
 * comments are always consumed but only reported (as {@code COMMENT}) when {@code useComments} is set;</li>
 * <li>a lone {@code -} or {@code /} and the other punctuation characters are reported as {@code SYMBOL},
 * and {@code .} as {@code DECIMAL};</li>
 * <li>text enclosed in {@code "} is reported as {@code DOUBLE_QUOTED_STRING}; text enclosed in a back-quote,
 * {@code \u2018}, {@code \u2019} or {@code '} is reported as {@code SINGLE_QUOTED_STRING}; the enclosing quotes
 * are excluded from the token range when {@code removeQuotes} is set;</li>
 * <li>anything else starts a {@code WORD} that runs until the next whitespace, symbol or quote character.</li>
 * </ul>
 *
 * @param input the stream of characters to tokenize
 * @param tokens the receiver of the tokens; it is passed through {@code adapt(input, tokens)} before use
 * @throws ParsingException if a quoted string is opened but its closing quote is never found
 */
@Override
public void tokenize(CharacterStream input, Tokens tokens) throws ParsingException {
    tokens = adapt(input, tokens);
    while (input.hasNext()) {
        char c = input.next();
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                // Just skip these whitespace characters ...
                break;
            // ==============================================================================================
            case '#': {
                // End-of-line comment ...
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                int endIndex = consumeRestOfLineComment(input);
                if (useComments) {
                    tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                }
                break;
            }
            // ==============================================================================================
            case '-': {
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                if (input.isNext('-')) {
                    // -- END OF LINE comment ...
                    int endIndex = consumeRestOfLineComment(input);
                    if (useComments) {
                        tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                    }
                } else {
                    // just a regular dash ...
                    tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
                }
                break;
            }
            // ==============================================================================================
            case '(':
            case ')':
            case '{':
            case '}':
            case '*':
            case ',':
            case ';':
            case '+':
            case '%':
            case '?':
            case '[':
            case ']':
            case '!':
            case '<':
            case '>':
            case '|':
            case '=':
            case ':':
                tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
                break;
            case '.':
                tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
                break;
            case '\"': {
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                boolean foundClosingQuote = false;
                while (input.hasNext()) {
                    c = input.next();
                    if ((c == '\\' || c == '"') && input.isNext('"')) {
                        // consume the " character since it is escaped (by a backslash or by doubling)
                        input.next();
                    } else if (c == '"') {
                        foundClosingQuote = true;
                        break;
                    }
                }
                if (!foundClosingQuote) {
                    String msg = "No matching double quote found after line " + startPosition.line()
                            + ", column " + startPosition.column();
                    throw new ParsingException(startPosition, msg);
                }
                // beyond last character read
                int endIndex = input.index() + 1;
                if (removeQuotes && endIndex - startIndex > 1) {
                    // At least one quoted character, so remove the quotes ...
                    startIndex += 1;
                    endIndex -= 1;
                }
                tokens.addToken(startPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
                break;
            }
            // back-quote character
            case '`':
            // left single-quote character
            case '\u2018':
            // right single-quote character
            case '\u2019':
            // single-quote character
            case '\'': {
                // The string must be closed by the same character that opened it.
                final char quoteChar = c;
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                boolean foundClosingQuote = false;
                while (input.hasNext()) {
                    c = input.next();
                    if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
                        // consume the quote character since it is escaped (by a backslash or by doubling)
                        input.next();
                    } else if (c == quoteChar) {
                        foundClosingQuote = true;
                        break;
                    }
                }
                if (!foundClosingQuote) {
                    String msg = "No matching single quote found after line " + startPosition.line()
                            + ", column " + startPosition.column();
                    throw new ParsingException(startPosition, msg);
                }
                // beyond last character read
                int endIndex = input.index() + 1;
                if (removeQuotes && endIndex - startIndex > 1) {
                    // At least one quoted character, so remove the quotes ...
                    startIndex += 1;
                    endIndex -= 1;
                }
                tokens.addToken(startPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
                break;
            }
            case '/': {
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                if (input.isNext('/')) {
                    // End-of-line comment ...
                    int endIndex = consumeRestOfLineComment(input);
                    if (useComments) {
                        tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                    }
                } else if (input.isNext('*')) {
                    // Multi-line comment ...
                    // Consume the opening '*' first so it cannot also be matched as the start of the closing
                    // "*/"; otherwise the input "/*/" would be treated as a complete comment.
                    input.next();
                    while (input.hasNext() && !input.isNext('*', '/')) {
                        input.next();
                    }
                    // consume the '*'
                    if (input.hasNext()) {
                        input.next();
                    }
                    // consume the '/'
                    if (input.hasNext()) {
                        input.next();
                    }
                    // the token will include the '/' and '*' characters
                    int endIndex = input.index() + 1;
                    if (useComments) {
                        tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                    }
                } else {
                    // just a regular slash ...
                    tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
                }
                break;
            }
            default: {
                int startIndex = input.index();
                Position startPosition = input.position(startIndex);
                // Read until another whitespace/symbol/decimal/slash/quote is found
                while (input.hasNext()
                        && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:'`\u2018\u2019\""))) {
                    input.next();
                }
                // beyond last character that was included
                int endIndex = input.index() + 1;
                tokens.addToken(startPosition, startIndex, endIndex, WORD);
            }
        }
    }
}

/**
 * Consumes the remainder of an end-of-line comment, including its line terminator.
 * <p>
 * Characters are read until a {@code \n} or {@code \r} has been consumed or the input is exhausted. When the
 * terminator is {@code \r} immediately followed by {@code \n}, the {@code \n} is consumed as well.
 *
 * @param input the stream, positioned on a character that belongs to the comment
 * @return the exclusive end index of the comment text; the line terminator is not part of the comment
 */
private int consumeRestOfLineComment(CharacterStream input) {
    char last = '\0';
    boolean foundLineTerminator = false;
    while (input.hasNext()) {
        last = input.next();
        if (last == '\n' || last == '\r') {
            foundLineTerminator = true;
            break;
        }
    }
    // With a terminator the current index is the terminator itself, which already is the exclusive end.
    // Without one the current index is the last comment character, so the end must point beyond it.
    int endIndex = input.index();
    if (!foundLineTerminator) {
        ++endIndex;
    }
    // Treat "\r\n" as a single line terminator.
    if (last == '\r' && input.isNext('\n')) {
        input.next();
    }
    return endIndex;
}
Also used : Position(io.debezium.text.Position) ParsingException(io.debezium.text.ParsingException)

Aggregations

ParsingException (io.debezium.text.ParsingException): 1, Position (io.debezium.text.Position): 1