use of org.antlr.v4.tool.Grammar in project antlr4 by antlr.
the class ATNSerializer method serialize.
/** Serialize state descriptors, edge descriptors, and decision→state map
* into list of ints:
* grammar-type, (ANTLRParser.LEXER, ...)
* max token type,
* num states,
* state-0-type ruleIndex, state-1-type ruleIndex, ... state-i-type ruleIndex optional-arg ...
* num rules,
* rule-1-start-state rule-1-args, rule-2-start-state rule-2-args, ...
* (args are token type,actionIndex in lexer else 0,0)
* num modes,
* mode-0-start-state, mode-1-start-state, ... (parser has 0 modes)
* num unicode-bmp-sets
* bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ...
* num unicode-smp-sets
* smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ...
* num total edges,
* src, trg, edge-type, edge arg1, optional edge arg2 (present always), ...
* num decisions,
* decision-0-start-state, decision-1-start-state, ...
* It is convenient to pack the values into unsigned shorts so the result can be stored as a Java string.
*/
// Serializes the field `atn` into an IntegerList and returns that list.
// NOTE(review): this listing appears to have lost lines (closing braces and the
// bodies of several branches and loops are not visible). The comments added here
// describe only the statements that are actually shown.
public IntegerList serialize() {
IntegerList data = new IntegerList();
// Presumably appends the serialization UUID to data; the helper is defined elsewhere.
serializeUUID(data, ATNDeserializer.SERIALIZED_UUID);
// convert grammar type to ATN const to avoid dependence on ANTLRParser
// Running count of transitions, accumulated in the state loop below.
int nedges = 0;
// Note that we use a LinkedHashMap as a set to
// maintain insertion order while deduplicating
// entries with the same key.
Map<IntervalSet, Boolean> sets = new LinkedHashMap<>();
// dump states, count edges and collect sets while doing so
IntegerList nonGreedyStates = new IntegerList();
IntegerList precedenceStates = new IntegerList();
for (ATNState s : atn.states) {
if (s == null) {
// might be optimized away
// NOTE(review): the rest of the null branch is not visible here; as listed, the
// next statement would dereference s. Confirm against the full source.
int stateType = s.getStateType();
// Non-greedy decision states and left-recursive rule start states are detected here;
// presumably they are recorded in the two lists declared above -- TODO confirm.
if (s instanceof DecisionState && ((DecisionState) s).nonGreedy) {
if (s instanceof RuleStartState && ((RuleStartState) s).isLeftRecursiveRule) {
if (s.ruleIndex == -1) {
} else {
// A LOOP_END state contributes its loop-back state number; a block start state
// contributes its end state number.
if (s.getStateType() == ATNState.LOOP_END) {
data.add(((LoopEndState) s).loopBackState.stateNumber);
} else if (s instanceof BlockStartState) {
data.add(((BlockStartState) s).endState.stateNumber);
if (s.getStateType() != ATNState.RULE_STOP) {
// the deserializer can trivially derive these edges, so there's no need to serialize them
nedges += s.getNumberOfTransitions();
for (int i = 0; i < s.getNumberOfTransitions(); i++) {
Transition t = s.transition(i);
int edgeType = Transition.serializationTypes.get(t.getClass());
// Sets used by SET / NOT_SET transitions are collected in `sets`.
if (edgeType == Transition.SET || edgeType == Transition.NOT_SET) {
SetTransition st = (SetTransition) t;
sets.put(st.set, true);
// non-greedy states
for (int i = 0; i < nonGreedyStates.size(); i++) {
// precedence states
for (int i = 0; i < precedenceStates.size(); i++) {
// Per-rule section: iterates over every entry of atn.ruleToStartState.
int nrules = atn.ruleToStartState.length;
for (int r = 0; r < nrules; r++) {
ATNState ruleStartState = atn.ruleToStartState[r];
// Lexer grammars get extra handling, with a distinct branch when the rule's
// token type is Token.EOF (branch bodies are not visible in this listing).
if (atn.grammarType == ATNType.LEXER) {
if (atn.ruleToTokenType[r] == Token.EOF) {
} else {
// Per-mode section: iterates over atn.modeToStartState when it is non-empty.
int nmodes = atn.modeToStartState.size();
if (nmodes > 0) {
for (ATNState modeStartState : atn.modeToStartState) {
// The collected sets are split on whether their maximum element fits in a char
// (<= Character.MAX_VALUE); presumably the two lists below receive the two groups -- TODO confirm.
List<IntervalSet> bmpSets = new ArrayList<>();
List<IntervalSet> smpSets = new ArrayList<>();
for (IntervalSet set : sets.keySet()) {
if (set.getMaxElement() <= Character.MAX_VALUE) {
} else {
// Each list is passed to serializeSets with its own CodePointSerializer callback.
// NOTE(review): the body of the first callback is not visible here.
serializeSets(data, bmpSets, new CodePointSerializer() {
public void serializeCodePoint(IntegerList data, int cp) {
serializeSets(data, smpSets, new CodePointSerializer() {
public void serializeCodePoint(IntegerList data, int cp) {
serializeInt(data, cp);
// Assign each set a sequential index: bmpSets entries first, then smpSets entries.
Map<IntervalSet, Integer> setIndices = new HashMap<>();
int setIndex = 0;
for (IntervalSet bmpSet : bmpSets) {
setIndices.put(bmpSet, setIndex++);
for (IntervalSet smpSet : smpSets) {
setIndices.put(smpSet, setIndex++);
// Second pass over the states: compute per-transition values (src, trg, type, arg1..arg3).
for (ATNState s : atn.states) {
if (s == null) {
// might be optimized away
if (s.getStateType() == ATNState.RULE_STOP) {
for (int i = 0; i < s.getNumberOfTransitions(); i++) {
Transition t = s.transition(i);
// NOTE(review): the argument of atn.states.get( is missing in this listing;
// the expression is truncated and must be restored from the full source.
if (atn.states.get( == null) {
throw new IllegalStateException("Cannot serialize a transition to a removed state.");
int src = s.stateNumber;
// NOTE(review): the initializer of trg is missing in this listing (truncated text).
int trg =;
int edgeType = Transition.serializationTypes.get(t.getClass());
int arg1 = 0;
int arg2 = 0;
int arg3 = 0;
// Fill trg/arg1..arg3 according to the transition type.
// NOTE(review): no break statements are visible between the cases in this listing.
switch(edgeType) {
case Transition.RULE:
// For rule transitions the target becomes the follow state, and the invoked
// rule's start state number is carried in arg1.
trg = ((RuleTransition) t).followState.stateNumber;
arg1 = ((RuleTransition) t).target.stateNumber;
arg2 = ((RuleTransition) t).ruleIndex;
arg3 = ((RuleTransition) t).precedence;
case Transition.PRECEDENCE:
PrecedencePredicateTransition ppt = (PrecedencePredicateTransition) t;
arg1 = ppt.precedence;
case Transition.PREDICATE:
PredicateTransition pt = (PredicateTransition) t;
arg1 = pt.ruleIndex;
arg2 = pt.predIndex;
arg3 = pt.isCtxDependent ? 1 : 0;
case Transition.RANGE:
arg1 = ((RangeTransition) t).from;
arg2 = ((RangeTransition) t).to;
// A lower bound equal to Token.EOF is replaced by 0 and flagged through arg3 = 1.
if (arg1 == Token.EOF) {
arg1 = 0;
arg3 = 1;
case Transition.ATOM:
arg1 = ((AtomTransition) t).label;
// A label equal to Token.EOF is replaced by 0 and flagged through arg3 = 1.
if (arg1 == Token.EOF) {
arg1 = 0;
arg3 = 1;
case Transition.ACTION:
ActionTransition at = (ActionTransition) t;
arg1 = at.ruleIndex;
arg2 = at.actionIndex;
// An action index of -1 is replaced by 0xFFFF.
if (arg2 == -1) {
arg2 = 0xFFFF;
arg3 = at.isCtxDependent ? 1 : 0;
case Transition.SET:
// Set transitions carry the index assigned to their set in setIndices.
arg1 = setIndices.get(((SetTransition) t).set);
case Transition.NOT_SET:
arg1 = setIndices.get(((SetTransition) t).set);
case Transition.WILDCARD:
// Decision section: iterates over atn.decisionToState.
int ndecisions = atn.decisionToState.size();
for (DecisionState decStartState : atn.decisionToState) {
// Lexer grammars additionally walk atn.lexerActions; in each visible case a value
// of -1 is replaced by 0xFFFF before being added to data.
if (atn.grammarType == ATNType.LEXER) {
for (LexerAction action : atn.lexerActions) {
switch(action.getActionType()) {
// NOTE(review): the case label for the channel action is not visible in this listing.
int channel = ((LexerChannelAction) action).getChannel();
data.add(channel != -1 ? channel : 0xFFFF);
case CUSTOM:
int ruleIndex = ((LexerCustomAction) action).getRuleIndex();
int actionIndex = ((LexerCustomAction) action).getActionIndex();
data.add(ruleIndex != -1 ? ruleIndex : 0xFFFF);
data.add(actionIndex != -1 ? actionIndex : 0xFFFF);
case MODE:
int mode = ((LexerModeAction) action).getMode();
data.add(mode != -1 ? mode : 0xFFFF);
case MORE:
case POP_MODE:
// NOTE(review): the cast below is to LexerPushModeAction; a separate case label
// for it is presumably missing from this listing -- TODO confirm.
mode = ((LexerPushModeAction) action).getMode();
data.add(mode != -1 ? mode : 0xFFFF);
case SKIP:
case TYPE:
int type = ((LexerTypeAction) action).getType();
data.add(type != -1 ? type : 0xFFFF);
// NOTE(review): the two statements below presumably belong to a default branch
// whose label is not visible in this listing.
String message = String.format(Locale.getDefault(), "The specified lexer action type %s is not valid.", action.getActionType());
throw new IllegalArgumentException(message);
// don't adjust the first value since that's the version number
for (int i = 1; i < data.size(); i++) {
// Every element from index 1 on must lie within Character.MIN_VALUE..Character.MAX_VALUE.
if (data.get(i) < Character.MIN_VALUE || data.get(i) > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Serialized ATN data element " + data.get(i) + " element " + i + " out of range " + (int) Character.MIN_VALUE + ".." + (int) Character.MAX_VALUE);
// Each element is shifted by 2 and masked to 16 bits before being stored back.
int value = (data.get(i) + 2) & 0xFFFF;
data.set(i, value);
return data;
use of org.antlr.v4.tool.Grammar in project antlr4 by antlr.
the class DefaultErrorStrategy method getErrorRecoverySet.
/* Compute the error recovery set for the current rule. During
* rule invocation, the parser pushes the set of tokens that can
* follow that rule reference on the stack; this amounts to
* computing FIRST of what follows the rule reference in the
* enclosing rule. See LinearApproximator.FIRST().
* This local follow set only includes tokens
* from within the rule; i.e., the FIRST computation done by
* ANTLR stops at the end of a rule.
* When you find a "no viable alt exception", the input is not
* consistent with any of the alternatives for rule r. The best
* thing to do is to consume tokens until you see something that
* can legally follow a call to r *or* any rule that called r.
* You don't want the exact set of viable next tokens because the
* input might just be missing a token--you might consume the
* rest of the input looking for one of the missing tokens.
* Consider grammar:
* a : '[' b ']'
* | '(' b ')'
* ;
* b : c '^' INT ;
* c : ID
* | INT
* ;
* At each rule invocation, the set of tokens that could follow
* that rule is pushed on a stack. Here are the various
* context-sensitive follow sets:
* FOLLOW(b1_in_a) = FIRST(']') = ']'
* FOLLOW(b2_in_a) = FIRST(')') = ')'
* FOLLOW(c_in_b) = FIRST('^') = '^'
* Upon erroneous input "[]", the call chain is
* a -> b -> c
* and, hence, the follow context stack is:
* depth follow set start of rule execution
* 0 <EOF> a (from main())
* 1 ']' b
* 2 '^' c
* Notice that ')' is not included, because b would have to have
* been called from a different context in rule a for ')' to be
* included.
* For error recovery, we cannot consider FOLLOW(c)
* (context-sensitive or otherwise). We need the combined set of
* all context-sensitive FOLLOW sets--the set of all tokens that
* could follow any reference in the call chain. We need to
* resync to one of those tokens. Note that FOLLOW(c)='^' and if
* we resync'd to that token, we'd consume until EOF. We need to
* sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
* In this case, for input "[]", LA(1) is ']' and in the set, so we would
* not consume anything. After printing an error, rule c would
* return normally. Rule b would not find the required '^' though.
* At this point, it gets a mismatched token error and throws an
* exception (since LA(1) is not in the viable following token
* set). The rule exception handler tries to recover, but finds
* the same recovery set and doesn't consume anything. Rule b
* exits normally returning to rule a. Now it finds the ']' (and
* with the successful match exits errorRecovery mode).
* So, you can see that the parser walks up the call chain looking
* for the token that was a member of the recovery set.
* Errors are not generated in errorRecovery mode.
* ANTLR's error recovery mechanism is based upon original ideas:
* "Algorithms + Data Structures = Programs" by Niklaus Wirth
* and
* "A note on error recovery in recursive descent parsers":
* Later, Josef Grosch had some good ideas:
* "Efficient and Comfortable Error Recovery in Recursive Descent
* Parsers":
* Like Grosch I implement context-sensitive FOLLOW sets that are combined
* at run-time upon error to avoid overhead during parsing.
*/
// Walks the rule-invocation chain starting at recognizer._ctx and returns an IntervalSet.
// NOTE(review): lines appear to be missing from this listing (closing braces, and no
// statement that adds `follow` to `recoverSet` is visible) -- confirm against the full source.
protected IntervalSet getErrorRecoverySet(Parser recognizer) {
ATN atn = recognizer.getInterpreter().atn;
RuleContext ctx = recognizer._ctx;
IntervalSet recoverSet = new IntervalSet();
// Continue while there is a context and its invokingState is non-negative.
while (ctx != null && ctx.invokingState >= 0) {
// compute what follows who invoked us
ATNState invokingState = atn.states.get(ctx.invokingState);
// The invoking state's first transition is cast to RuleTransition; its followState
// is what atn.nextTokens is asked about.
RuleTransition rt = (RuleTransition) invokingState.transition(0);
IntervalSet follow = atn.nextTokens(rt.followState);
// Move up to the parent context.
ctx = ctx.parent;
// System.out.println("recover set "+recoverSet.toString(recognizer.getTokenNames()));
return recoverSet;
use of org.antlr.v4.tool.Grammar in project antlr4 by antlr.
the class Grammar method importTokensFromTokensFile.
// Reads the "tokenVocab" option and, when it is set, loads a name -> type map through
// TokenVocabParser and defines each entry on this grammar.
// NOTE(review): closing braces are missing from this listing, and an `else` before
// defineTokenName may be missing as well -- confirm against the full source.
public void importTokensFromTokensFile() {
String vocab = getOptionString("tokenVocab");
if (vocab != null) {
TokenVocabParser vparser = new TokenVocabParser(this);
Map<String, Integer> tokens = vparser.load();
tool.log("grammar", "tokens=" + tokens);
for (String t : tokens.keySet()) {
// Names beginning with a single quote are passed to defineStringLiteral.
if (t.charAt(0) == '\'')
defineStringLiteral(t, tokens.get(t));
defineTokenName(t, tokens.get(t));
use of org.antlr.v4.tool.Grammar in project antlr4 by antlr.
the class Grammar method loadImportedGrammars.
// Finds the IMPORT child of this grammar's AST and loads each imported grammar via the tool.
// NOTE(review): this listing appears to be missing lines (the bodies of the early-exit
// checks, closing braces, and whatever follows the final comment); the comments here
// describe only the visible statements.
public void loadImportedGrammars() {
// NOTE(review): the statement guarded by this null check is not visible here.
if (ast == null)
GrammarAST i = (GrammarAST) ast.getFirstChildWithType(ANTLRParser.IMPORT);
// NOTE(review): the statement guarded by this null check is not visible here.
if (i == null)
Set<String> visited = new HashSet<>();
importedGrammars = new ArrayList<Grammar>();
for (Object c : i.getChildren()) {
GrammarAST t = (GrammarAST) c;
String importedGrammarName = null;
// For an ASSIGN node the name is taken from child 1; for an ID node, from the node itself.
if (t.getType() == ANTLRParser.ASSIGN) {
t = (GrammarAST) t.getChild(1);
importedGrammarName = t.getText();
} else if (t.getType() == ANTLRParser.ID) {
importedGrammarName = t.getText();
if (visited.contains(importedGrammarName)) {
// ignore circular refs
Grammar g;
try {
g = tool.loadImportedGrammar(this, t);
} catch (IOException ioe) {
// A read failure is reported through the tool's error manager.
tool.errMgr.grammarError(ErrorType.ERROR_READING_IMPORTED_GRAMMAR, importedGrammarName, t.getToken(), importedGrammarName, name);
// did it come back as error node or missing?
// NOTE(review): the statement guarded by this null check is not visible; as listed,
// the assignment below would run on a null g.
if (g == null)
g.parent = this;
// recursively pursue any imports in this import
use of org.antlr.v4.tool.Grammar in project antlr4 by antlr.
the class GrammarParserInterpreter method getAllPossibleParseTrees.
/** Given an ambiguous parse information, return the list of ambiguous parse trees.
* An ambiguity occurs when a specific token sequence can be recognized
* in more than one way by the grammar. These ambiguities are detected only
* at decision points.
* The list of trees includes the actual interpretation (that for
* the minimum alternative number) and all ambiguous alternatives.
* The actual interpretation is always first.
* This method reuses the same physical input token stream used to
* detect the ambiguity by the original parser in the first place.
* This method resets/seeks within but does not alter originalParser.
* The trees are rooted at the node whose start..stop token indices
* include the start and stop indices of this ambiguity event. That is,
* the trees returned will always include the complete ambiguous subphrase
* identified by the ambiguity event. The subtrees returned will
* also always contain the node associated with the overridden decision.
* Be aware that this method does NOT notify error or parse listeners as
* it would trigger duplicate or otherwise unwanted events.
* This uses a temporary ParserATNSimulator and a ParserInterpreter
* so we don't mess up any statistics, event lists, etc...
* The parse tree constructed while identifying/making ambiguityInfo is
* not affected by this method as it creates a new parser interp to
* get the ambiguous interpretations.
* Nodes in the returned ambig trees are independent of the original parse
* tree (constructed while identifying/creating ambiguityInfo).
* @since 4.5.1
* @param g From which grammar should we drive alternative
* numbers and alternative labels.
* @param originalParser The parser used to create ambiguityInfo; it
* is not modified by this routine and can be either
* a generated or interpreted parser. Its token
* stream *is* reset/seek()'d.
* @param tokens A stream of tokens to use with the temporary parser.
* This will often be just the token stream within the
* original parser but here it is for flexibility.
* @param decision Which decision to try different alternatives for.
* @param alts The set of alternatives to try while re-parsing.
* @param startIndex The index of the first token of the ambiguous
* input or other input of interest.
* @param stopIndex The index of the last token of the ambiguous input.
* The start and stop indexes are used primarily to
* identify how much of the resulting parse tree
* to return.
* @param startRuleIndex The start rule for the entire grammar, not
* the ambiguous decision. We re-parse the entire input
* and so we need the original start rule.
* @return The list of all possible interpretations of
* the input for the decision in ambiguityInfo.
* The actual interpretation chosen by the parser
* is always given first because this method
* retests the input in alternative order and
* ANTLR always resolves ambiguities by choosing
* the first alternative that matches the input.
* The subtree returned
* @throws RecognitionException Throws upon syntax error while matching
* ambiguous input.
*/
// Re-parses the input once per set bit in `alts`, overriding `decision` at `startIndex`
// with that alternative, and returns the list `trees`.
// NOTE(review): this listing appears to be missing lines (closing braces, and no
// statement that adds a subtree to `trees` is visible) -- confirm against the full source.
public static List<ParserRuleContext> getAllPossibleParseTrees(Grammar g, Parser originalParser, TokenStream tokens, int decision, BitSet alts, int startIndex, int stopIndex, int startRuleIndex) throws RecognitionException {
List<ParserRuleContext> trees = new ArrayList<ParserRuleContext>();
// Create a new parser interpreter to parse the ambiguous subphrase
ParserInterpreter parser = deriveTempParserInterpreter(g, originalParser, tokens);
// Clamp stopIndex to tokens.size() - 2 when it is at or beyond the last token index.
if (stopIndex >= (tokens.size() - 1)) {
// if we are pointing at EOF token
// EOF is not in tree, so must be 1 less than last non-EOF token
stopIndex = tokens.size() - 2;
// get ambig trees
// Iterate over the set bits of alts, lowest first.
int alt = alts.nextSetBit(0);
while (alt >= 0) {
// re-parse entire input for all ambiguous alternatives
// (don't have to do first as it's been parsed, but do again for simplicity
// using this temp parser.)
parser.addDecisionOverride(decision, startIndex, alt);
ParserRuleContext t = parser.parse(startRuleIndex);
GrammarInterpreterRuleContext ambigSubTree = (GrammarInterpreterRuleContext) Trees.getRootOfSubtreeEnclosingRegion(t, startIndex, stopIndex);
// Use higher of overridden decision tree or tree enclosing all tokens
if (Trees.isAncestorOf(parser.getOverrideDecisionRoot(), ambigSubTree)) {
ambigSubTree = (GrammarInterpreterRuleContext) parser.getOverrideDecisionRoot();
// Advance to the next set bit; nextSetBit returns -1 when there is none, ending the loop.
alt = alts.nextSetBit(alt + 1);
return trees;