Use of edu.stanford.nlp.parser.common.ParserConstraint in project CoreNLP by stanfordnlp.
The class LatticeXMLReader, method load:
private boolean load(InputStream stream) {
  DocumentBuilder parser = XMLUtils.getXmlParser();
  if (parser == null)
    return false;
  try {
    Document xmlDocument = parser.parse(stream);
    Element root = xmlDocument.getDocumentElement();
    NodeList sentences = root.getElementsByTagName(SENTENCE);
    for (int i = 0; i < sentences.getLength(); i++) {
      Element sentence = (Element) sentences.item(i);
      Lattice lattice = new Lattice();
      // Create the node map
      SortedSet<Integer> nodes = new TreeSet<>();
      NodeList xmlNodes = sentence.getElementsByTagName(NODE);
      for (int nodeIdx = 0; nodeIdx < xmlNodes.getLength(); nodeIdx++) {
        Element xmlNode = (Element) xmlNodes.item(nodeIdx);
        int nodeName = Integer.parseInt(xmlNode.getAttribute(NODE_ID));
        nodes.add(nodeName);
      }
      Map<Integer, Integer> nodeMap = Generics.newHashMap();
      int realNodeIdx = 0;
      int lastBoundaryNode = -1;
      for (int nodeName : nodes) {
        if (lastBoundaryNode == -1) {
          assert nodeName % NODE_OFFSET == 0;
          lastBoundaryNode = realNodeIdx;
        } else if (nodeName % NODE_OFFSET == 0) {
          ParserConstraint c = new ParserConstraint(lastBoundaryNode, realNodeIdx, ".*");
          lattice.addConstraint(c);
        }
        nodeMap.put(nodeName, realNodeIdx);
        realNodeIdx++;
      }
      // Read the edges
      NodeList xmlEdges = sentence.getElementsByTagName(EDGE);
      for (int edgeIdx = 0; edgeIdx < xmlEdges.getLength(); edgeIdx++) {
        Element xmlEdge = (Element) xmlEdges.item(edgeIdx);
        String segment = xmlEdge.getAttribute(SEGMENT);
        // Input weights should be log scale
        double weight = Double.parseDouble(xmlEdge.getAttribute(WEIGHT));
        int from = Integer.parseInt(xmlEdge.getAttribute(FROM_NODE));
        int normFrom = nodeMap.get(from);
        int to = Integer.parseInt(xmlEdge.getAttribute(TO_NODE));
        int normTo = nodeMap.get(to);
        LatticeEdge e = new LatticeEdge(segment, weight, normFrom, normTo);
        // Set attributes below here
        NodeList xmlAttrs = xmlEdge.getElementsByTagName(E_ATTR_NODE);
        for (int attrIdx = 0; attrIdx < xmlAttrs.getLength(); attrIdx++) {
          Element xmlAttr = (Element) xmlAttrs.item(attrIdx);
          String key = xmlAttr.getAttribute(E_ATTR);
          String value = xmlAttr.getAttribute(E_ATTR_VAL);
          e.setAttr(key, value);
        }
        lattice.addEdge(e);
      }
      // Configure for parsing in ExhaustivePCFG parser
      lattice.addBoundary();
      lattices.add(lattice);
    }
  } catch (IOException e) {
    System.err.printf("%s: Error reading XML from input stream.%n", this.getClass().getName());
    e.printStackTrace();
    return false;
  } catch (SAXException e) {
    e.printStackTrace();
    return false;
  }
  return true;
}
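The node-renumbering loop is the subtle part of this loader: the XML carries sparse node IDs in which multiples of NODE_OFFSET mark word boundaries, and the loop maps them onto dense indices while registering a wildcard ParserConstraint at each boundary. A minimal standalone sketch of that mapping (NodeRenumberSketch, the NODE_OFFSET value, and the sample IDs are all illustrative, not taken from CoreNLP):

import java.util.*;

public class NodeRenumberSketch {

  // Illustrative stand-in; the real NODE_OFFSET comes from the lattice format.
  static final int NODE_OFFSET = 100;

  public static void main(String[] args) {
    // Sparse XML node IDs; multiples of NODE_OFFSET (100, 200, ...) are word boundaries.
    SortedSet<Integer> nodes = new TreeSet<>(Arrays.asList(100, 101, 102, 200, 201, 300));
    Map<Integer, Integer> nodeMap = new HashMap<>();
    int realNodeIdx = 0;
    int lastBoundaryNode = -1;
    for (int nodeName : nodes) {
      if (lastBoundaryNode == -1) {
        assert nodeName % NODE_OFFSET == 0; // the smallest ID must itself be a boundary
        lastBoundaryNode = realNodeIdx;
      } else if (nodeName % NODE_OFFSET == 0) {
        // As in the loader above, each later boundary yields a wildcard
        // constraint anchored at the first boundary node.
        System.out.printf("ParserConstraint(%d, %d, \".*\")%n", lastBoundaryNode, realNodeIdx);
      }
      nodeMap.put(nodeName, realNodeIdx);
      realNodeIdx++;
    }
    // Dense map contents: {100=0, 101=1, 102=2, 200=3, 201=4, 300=5}
    System.out.println(nodeMap);
  }
}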
Use of edu.stanford.nlp.parser.common.ParserConstraint in project CoreNLP by stanfordnlp.
The class CorefMentionFinder, method findSyntacticHead:
public Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
  // mention ends with 's
  int endIdx = m.endIndex;
  if (m.originalSpan.size() > 0) {
    String lastWord = m.originalSpan.get(m.originalSpan.size() - 1).get(CoreAnnotations.TextAnnotation.class);
    if ((lastWord.equals("'s") || lastWord.equals("'")) && m.originalSpan.size() != 1)
      endIdx--;
  }
  Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
  // If some subtree spans the mention exactly, take its head
  if (exactMatch != null) {
    return safeHead(exactMatch, endIdx);
  }
  // No exact match found. In this case, parse the actual extent of the
  // mention, embedded in a sentence context, so as to make the parser work better :-)
  if (allowReparsing) {
    int approximateness = 0;
    List<CoreLabel> extentTokens = new ArrayList<>();
    extentTokens.add(initCoreLabel("It"));
    extentTokens.add(initCoreLabel("was"));
    final int ADDED_WORDS = 2;
    for (int i = m.startIndex; i < endIdx; i++) {
      // Add everything except separated dashes! The separated dashes mess with the parser too badly.
      CoreLabel label = tokens.get(i);
      if (!"-".equals(label.word())) {
        extentTokens.add(tokens.get(i));
      } else {
        approximateness++;
      }
    }
    extentTokens.add(initCoreLabel("."));
    // Constrain the parse to the part we're interested in.
    // Starting from ADDED_WORDS comes from skipping "It was".
    // -1 to exclude the period.
    // We now let it be any kind of nominal constituent, since there
    // are VP and S ones.
    ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
    List<ParserConstraint> constraints = Collections.singletonList(constraint);
    Tree tree = parse(extentTokens, constraints);
    // possibly now unnecessary, as the parser uses CoreLabels?
    convertToCoreLabels(tree);
    // remember it has ADDED_WORDS extra words at the beginning
    tree.indexSpans(m.startIndex - ADDED_WORDS);
    Tree subtree = findPartialSpan(tree, m.startIndex);
    // There was a possible problem that, with a crazy parse, extentHead could be one of the added words, not a real word!
    // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow anything
    // past the right end (that is, just that final period).
    Tree extentHead = safeHead(subtree, endIdx);
    assert (extentHead != null);
    // extentHead is a child in the local extent parse tree. We need to find the corresponding node in the main tree.
    // Because we deleted dashes, its index will be >= the index in the extent parse tree.
    CoreLabel l = (CoreLabel) extentHead.label();
    Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
    assert (realHead != null);
    return realHead;
  }
  // If reparsing wasn't allowed, try to find a span in the tree
  // which happens to have the head
  Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
  if (wordMatch != null) {
    Tree head = safeHead(wordMatch, endIdx);
    if (head != null) {
      int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      if (index >= m.startIndex && index < endIdx) {
        return head;
      }
    }
  }
  // If that didn't work, guess that it's the last word
  int lastNounIdx = endIdx - 1;
  for (int i = m.startIndex; i < m.endIndex; i++) {
    if (tokens.get(i).tag().startsWith("N"))
      lastNounIdx = i;
    else if (tokens.get(i).tag().startsWith("W"))
      break;
  }
  List<Tree> leaves = root.getLeaves();
  Tree endLeaf = leaves.get(lastNounIdx);
  return endLeaf;
}
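The heart of the reparsing branch is wrapping the mention in a carrier sentence, "It was <mention> .", and constraining the parser so the mention span must form a constituent. A condensed sketch of that trick against the public LexicalizedParser/ParserQuery API (ReparseSketch and the word helper are hypothetical names; the helper mirrors what initCoreLabel presumably does):

import java.util.*;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;

public class ReparseSketch {

  // Builds "It was <mention> ." and constrains the parse so the mention
  // span must be a constituent, then returns the best parse.
  public static Tree reparseMention(LexicalizedParser parser, List<CoreLabel> mentionTokens) {
    final int ADDED_WORDS = 2; // "It was"
    List<CoreLabel> extent = new ArrayList<>();
    extent.add(word("It"));
    extent.add(word("was"));
    extent.addAll(mentionTokens);
    extent.add(word("."));

    // Constrain [ADDED_WORDS, size - 1) -- the mention itself, excluding the
    // prepended words and the final period -- to be some constituent (".*").
    ParserConstraint constraint =
        new ParserConstraint(ADDED_WORDS, extent.size() - 1, Pattern.compile(".*"));
    ParserQuery pq = parser.parserQuery();
    pq.setConstraints(Collections.singletonList(constraint));
    pq.parse(extent);
    return pq.getBestParse();
  }

  private static CoreLabel word(String text) {
    CoreLabel label = new CoreLabel();
    label.set(CoreAnnotations.TextAnnotation.class, text);
    label.set(CoreAnnotations.ValueAnnotation.class, text);
    return label;
  }
}

In the method above, parse(extentTokens, constraints) wraps essentially this call sequence; findPartialSpan and safeHead then walk the resulting tree to pull out the mention's head.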
Use of edu.stanford.nlp.parser.common.ParserConstraint in project CoreNLP by stanfordnlp.
The class RuleBasedCorefMentionFinder, method findSyntacticHead:
protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
  // mention ends with 's
  int endIdx = m.endIndex;
  if (m.originalSpan.size() > 0) {
    String lastWord = m.originalSpan.get(m.originalSpan.size() - 1).get(CoreAnnotations.TextAnnotation.class);
    if ((lastWord.equals("'s") || lastWord.equals("'")) && m.originalSpan.size() != 1)
      endIdx--;
  }
  Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
  // If some subtree spans the mention exactly, take its head
  if (exactMatch != null) {
    return safeHead(exactMatch, endIdx);
  }
  // No exact match found. In this case, parse the actual extent of the
  // mention, embedded in a sentence context, so as to make the parser work better :-)
  if (allowReparsing) {
    int approximateness = 0;
    List<CoreLabel> extentTokens = new ArrayList<>();
    extentTokens.add(initCoreLabel("It"));
    extentTokens.add(initCoreLabel("was"));
    final int ADDED_WORDS = 2;
    for (int i = m.startIndex; i < endIdx; i++) {
      // Add everything except separated dashes! The separated dashes mess with the parser too badly.
      CoreLabel label = tokens.get(i);
      if (!"-".equals(label.word())) {
        // Necessary to copy tokens in case the parser does things like
        // put new indices on the tokens
        extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
      } else {
        approximateness++;
      }
    }
    extentTokens.add(initCoreLabel("."));
    // Constrain the parse to the part we're interested in.
    // Starting from ADDED_WORDS comes from skipping "It was".
    // -1 to exclude the period.
    // We now let it be any kind of nominal constituent, since there
    // are VP and S ones.
    ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
    List<ParserConstraint> constraints = Collections.singletonList(constraint);
    Tree tree = parse(extentTokens, constraints);
    // possibly now unnecessary, as the parser uses CoreLabels?
    convertToCoreLabels(tree);
    // remember it has ADDED_WORDS extra words at the beginning
    tree.indexSpans(m.startIndex - ADDED_WORDS);
    Tree subtree = findPartialSpan(tree, m.startIndex);
    // There was a possible problem that, with a crazy parse, extentHead could be one of the added words, not a real word!
    // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow anything
    // past the right end (that is, just that final period).
    Tree extentHead = safeHead(subtree, endIdx);
    assert (extentHead != null);
    // extentHead is a child in the local extent parse tree. We need to find the corresponding node in the main tree.
    // Because we deleted dashes, its index will be >= the index in the extent parse tree.
    CoreLabel l = (CoreLabel) extentHead.label();
    Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
    assert (realHead != null);
    return realHead;
  }
  // If reparsing wasn't allowed, try to find a span in the tree
  // which happens to have the head
  Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
  if (wordMatch != null) {
    Tree head = safeHead(wordMatch, endIdx);
    if (head != null) {
      int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      if (index >= m.startIndex && index < endIdx) {
        return head;
      }
    }
  }
  // If that didn't work, guess that it's the last word
  int lastNounIdx = endIdx - 1;
  for (int i = m.startIndex; i < m.endIndex; i++) {
    if (tokens.get(i).tag().startsWith("N"))
      lastNounIdx = i;
    else if (tokens.get(i).tag().startsWith("W"))
      break;
  }
  List<Tree> leaves = root.getLeaves();
  Tree endLeaf = leaves.get(lastNounIdx);
  return endLeaf;
}
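The only substantive difference from the CorefMentionFinder version above is the token-copying step: tokens are duplicated through the label factory before being handed to the parser, so that any re-indexing the parser performs cannot corrupt the original sentence's labels. A tiny sketch of that defensive copy (CopyLabelSketch and the sample annotations are illustrative):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;

public class CopyLabelSketch {
  public static void main(String[] args) {
    CoreLabel original = new CoreLabel();
    original.setWord("sausage");
    original.set(CoreAnnotations.IndexAnnotation.class, 5);

    // Copy through the label factory so the parser can re-index the copy
    // without clobbering the original token's annotations.
    CoreLabel copy = (CoreLabel) original.labelFactory().newLabel(original);
    copy.set(CoreAnnotations.IndexAnnotation.class, 1);

    System.out.println(original.get(CoreAnnotations.IndexAnnotation.class)); // still 5
    System.out.println(copy.get(CoreAnnotations.IndexAnnotation.class)); // 1
  }
}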
Use of edu.stanford.nlp.parser.common.ParserConstraint in project CoreNLP by stanfordnlp.
The class PerceptronModel, method findHighestScoringTransitions:
private Collection<ScoredObject<Integer>> findHighestScoringTransitions(State state, List<String> features, boolean requireLegal, int numTransitions, List<ParserConstraint> constraints) {
  float[] scores = new float[transitionIndex.size()];
  for (String feature : features) {
    Weight weight = featureWeights.get(feature);
    if (weight == null) {
      // Features not in our index are ignored
      continue;
    }
    weight.score(scores);
  }
  PriorityQueue<ScoredObject<Integer>> queue = new PriorityQueue<>(numTransitions + 1, ScoredComparator.ASCENDING_COMPARATOR);
  for (int i = 0; i < scores.length; ++i) {
    if (!requireLegal || transitionIndex.get(i).isLegal(state, constraints)) {
      queue.add(new ScoredObject<>(i, scores[i]));
      if (queue.size() > numTransitions) {
        queue.poll();
      }
    }
  }
  return queue;
}
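The PriorityQueue usage here is a standard bounded top-k selection: with an ascending comparator, the weakest candidate sits at the head, and polling after each insertion once the size exceeds numTransitions keeps exactly the highest-scoring legal transitions. The same pattern in isolation, with plain indices standing in for ScoredObject and a lambda for ScoredComparator (TopKSketch is an illustrative name):

import java.util.PriorityQueue;

public class TopKSketch {

  // Returns the indices of the k highest values in scores (heap order, not sorted).
  public static PriorityQueue<Integer> topK(float[] scores, int k) {
    // Min-heap on score: the weakest of the kept candidates is at the head.
    PriorityQueue<Integer> queue =
        new PriorityQueue<>(k + 1, (a, b) -> Float.compare(scores[a], scores[b]));
    for (int i = 0; i < scores.length; ++i) {
      queue.add(i);
      if (queue.size() > k) {
        queue.poll(); // evict the weakest so only the top k remain
      }
    }
    return queue;
  }

  public static void main(String[] args) {
    float[] scores = { 0.1f, 2.5f, -3.0f, 1.7f, 0.9f };
    for (int i : topK(scores, 2)) {
      System.out.printf("transition %d, score %.1f%n", i, scores[i]); // indices 3 and 1
    }
  }
}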
Use of edu.stanford.nlp.parser.common.ParserConstraint in project CoreNLP by stanfordnlp.
The class LexicalizedParserITest, method testConstraints:
/**
 * Test what happens if you put a constraint on the parse.
 */
public void testConstraints() {
  List<CoreLabel> sentence = sampleSausage();
  ParserQuery pq = englishParser.parserQuery();
  ParserConstraint constraint = new ParserConstraint(0, 2, "SBAR|SBAR[^a-zA-Z].*");
  List<ParserConstraint> constraints = new ArrayList<>();
  constraints.add(constraint);
  pq.setConstraints(constraints);
  pq.parse(sentence);
  StringWriter sw = new StringWriter();
  pennPrint.printTree(pq.getBestParse(), new PrintWriter(sw));
  String actualOutput = sw.toString().replaceAll("\\s+", " ").trim();
  String expectedOutput = "(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))";
  expectedOutput = expectedOutput.replaceAll("\\s+", " ").trim();
  // Not exactly sure what should come back, but it shouldn't be the
  // original output any more
  assertFalse("Tree should not match the original tree any more", expectedOutput.equals(actualOutput));
  assertTrue("Tree should be forced to contain SBAR", actualOutput.contains("SBAR"));
  //System.out.println(pq.getBestParse());
}
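The constraint pattern "SBAR|SBAR[^a-zA-Z].*" deserves a note: it matches the bare SBAR label and annotated variants such as "SBAR-2", but not distinct categories like SBARQ, since the character after "SBAR" must not be a letter. A quick standalone check of the regex (ConstraintPatternCheck is an illustrative name; the full-match semantics assume the parser tests candidate state labels with Matcher.matches()):

import java.util.regex.Pattern;

public class ConstraintPatternCheck {
  public static void main(String[] args) {
    Pattern p = Pattern.compile("SBAR|SBAR[^a-zA-Z].*");
    for (String state : new String[] { "SBAR", "SBAR-2", "SBARQ", "NP" }) {
      // matches() requires the whole state label to match the pattern.
      System.out.printf("%s -> %b%n", state, p.matcher(state).matches());
    }
    // prints: SBAR -> true, SBAR-2 -> true, SBARQ -> false, NP -> false
  }
}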