Use of io.debezium.text.Position in project debezium by debezium.
The class DdlTokenizer, method tokenize.
/**
 * Reads the whole character stream and reports every DDL token it finds to {@code tokens}.
 * <p>
 * Recognized token kinds, in the order the switch handles them:
 * <ul>
 * <li>whitespace (space, tab, CR, LF) is skipped;</li>
 * <li>end-of-line comments introduced by a hash sign, two dashes or two slashes, and block
 * comments introduced by slash-star, are reported as {@code COMMENT} only when
 * {@code useComments} is set (they are always consumed);</li>
 * <li>a lone dash or slash and the listed punctuation characters are {@code SYMBOL};</li>
 * <li>a period is {@code DECIMAL};</li>
 * <li>text in double quotes is {@code DOUBLE_QUOTED_STRING}; text in single quotes, back-quotes
 * or typographic single quotes is {@code SINGLE_QUOTED_STRING}; when {@code removeQuotes} is
 * set the reported range excludes the surrounding quote characters;</li>
 * <li>anything else is a {@code WORD} that runs up to the next whitespace or delimiter.</li>
 * </ul>
 * All end indexes passed to {@code tokens.addToken} are exclusive.
 *
 * @param input the character stream to tokenize
 * @param tokens the receiver of the tokens; it is first passed through {@code adapt(input, tokens)}
 *            and the returned instance is the one actually used
 * @throws ParsingException if a quoted string is still open when the input ends
 */
@Override
public void tokenize(CharacterStream input, Tokens tokens) throws ParsingException {
    tokens = adapt(input, tokens);
    int startIndex;
    int endIndex;
    while (input.hasNext()) {
        char c = input.next();
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                // Just skip these whitespace characters ...
                break;
            // ==============================================================================================
            case '#': {
                // End-of-line comment ...
                startIndex = input.index();
                Position startPosition = input.position(startIndex);
                endIndex = consumeToEndOfLine(input);
                if (useComments) {
                    tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                }
                break;
            }
            // ==============================================================================================
            case '-': {
                startIndex = input.index();
                Position startPosition = input.position(startIndex);
                if (input.isNext('-')) {
                    // -- END OF LINE comment ... (the second dash is consumed by the scan)
                    endIndex = consumeToEndOfLine(input);
                    if (useComments) {
                        tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                    }
                } else {
                    // just a regular dash ...
                    tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
                }
                break;
            }
            // ==============================================================================================
            case '(':
            case ')':
            case '{':
            case '}':
            case '*':
            case ',':
            case ';':
            case '+':
            case '%':
            case '?':
            case '[':
            case ']':
            case '!':
            case '<':
            case '>':
            case '|':
            case '=':
            case ':':
                tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
                break;
            case '.':
                tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
                break;
            case '\"': {
                startIndex = input.index();
                Position startingPosition = input.position(startIndex);
                boolean foundClosingQuote = false;
                while (input.hasNext()) {
                    c = input.next();
                    if ((c == '\\' || c == '"') && input.isNext('"')) {
                        // a backslash or a doubled quote escapes the following quote, so consume it
                        c = input.next();
                    } else if (c == '"') {
                        foundClosingQuote = true;
                        break;
                    }
                }
                if (!foundClosingQuote) {
                    String msg = "No matching double quote found after line " + startingPosition.line()
                            + ", column " + startingPosition.column();
                    throw new ParsingException(startingPosition, msg);
                }
                // beyond last character read
                endIndex = input.index() + 1;
                if (removeQuotes && endIndex - startIndex > 1) {
                    // report only the text between the quotes
                    startIndex += 1;
                    endIndex -= 1;
                }
                tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
                break;
            }
            // back-quote character
            case '`':
            // left single-quote character
            case '\u2018':
            // right single-quote character
            case '\u2019':
            // single-quote character
            case '\'': {
                // the string must be closed by the same character that opened it
                char quoteChar = c;
                startIndex = input.index();
                Position startingPosition = input.position(startIndex);
                boolean foundClosingQuote = false;
                while (input.hasNext()) {
                    c = input.next();
                    if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
                        // a backslash or a doubled quote escapes the following quote, so consume it
                        c = input.next();
                    } else if (c == quoteChar) {
                        foundClosingQuote = true;
                        break;
                    }
                }
                if (!foundClosingQuote) {
                    String msg = "No matching single quote found after line " + startingPosition.line()
                            + ", column " + startingPosition.column();
                    throw new ParsingException(startingPosition, msg);
                }
                // beyond last character read
                endIndex = input.index() + 1;
                if (removeQuotes && endIndex - startIndex > 1) {
                    // report only the text between the quotes
                    startIndex += 1;
                    endIndex -= 1;
                }
                tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
                break;
            }
            case '/': {
                startIndex = input.index();
                Position startingPosition = input.position(startIndex);
                if (input.isNext('/')) {
                    // End-of-line comment ... (the second slash is consumed by the scan)
                    endIndex = consumeToEndOfLine(input);
                    if (useComments) {
                        tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
                    }
                } else if (input.isNext('*')) {
                    // Multi-line comment ...
                    // Consume the opening '*' before looking for the terminator; otherwise it could
                    // double as the '*' of the terminator and a comment starting with the three
                    // characters slash, star, slash would be closed immediately.
                    input.next();
                    while (input.hasNext() && !input.isNext('*', '/')) {
                        input.next();
                    }
                    // consume the '*'
                    if (input.hasNext()) {
                        input.next();
                    }
                    // consume the '/'
                    if (input.hasNext()) {
                        input.next();
                    }
                    // the token will include the '/' and '*' characters
                    endIndex = input.index() + 1;
                    if (useComments) {
                        tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
                    }
                } else {
                    // just a regular slash ...
                    tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
                }
                break;
            }
            default: {
                startIndex = input.index();
                Position startPosition = input.position(startIndex);
                // Read until another whitespace/symbol/decimal/slash/quote is found
                while (input.hasNext()
                        && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:'`\u2018\u2019\""))) {
                    input.next();
                }
                // beyond last character that was included
                endIndex = input.index() + 1;
                tokens.addToken(startPosition, startIndex, endIndex, WORD);
            }
        }
    }
}

/**
 * Consumes the remainder of the current line, including its terminator (LF, CR or CR LF).
 *
 * @param input the stream, positioned on a character of an end-of-line comment
 * @return the exclusive end index of the comment text: the index of the line terminator when
 *         one was found, otherwise one past the last character of the input
 */
private int consumeToEndOfLine(CharacterStream input) {
    while (input.hasNext()) {
        char c = input.next();
        if (c == '\n' || c == '\r') {
            // the token won't include the '\n' or '\r' character(s)
            int end = input.index();
            if (c == '\r' && input.isNext('\n')) {
                // treat CR LF as a single terminator
                input.next();
            }
            return end;
        }
    }
    // no terminator: the end index must point beyond the last character
    return input.index() + 1;
}
Aggregations