Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the method originalFindSyntacticHead of class GenericDataSetReader.
/**
 * This is the original version of {@link #findSyntacticHead} before Chris's modifications.
 * There's no good reason to use it except for producing historical results.
 * It finds the syntactic head of the given entity mention.
 *
 * @param ent The entity mention
 * @param root The Tree for the entire sentence in which it occurs.
 * @param tokens The Sentence in which it occurs
 * @return The tree object corresponding to the head. This MUST be a child of root.
 *     It will be a leaf in the parse tree.
 */
public Tree originalFindSyntacticHead(EntityMention ent, Tree root, List<CoreLabel> tokens) {
  logger.fine("Searching for tree matching " + ent);

  // First attempt: find a constituent in the sentence parse whose span
  // matches the mention's extent exactly.
  Tree exactMatch = findTreeWithSpan(root, ent.getExtentTokenStart(), ent.getExtentTokenEnd());
  if (exactMatch != null) {
    logger.fine("Mention \"" + ent + "\" mapped to tree: " + printTree(exactMatch));
    return safeHead(exactMatch);
  }

  // No exact match: parse just the tokens covered by the mention's extent
  // and take the head of that local parse instead.
  List<CoreLabel> extentTokens = new ArrayList<>();
  for (int pos = ent.getExtentTokenStart(); pos < ent.getExtentTokenEnd(); pos++) {
    extentTokens.add(tokens.get(pos));
  }
  Tree localParse = parse(extentTokens);
  logger.fine("No exact match found. Local parse:\n" + localParse.pennString());
  convertToCoreLabels(localParse);
  // Re-index so token spans in the local parse line up with the full sentence.
  localParse.indexSpans(ent.getExtentTokenStart());
  Tree extentHead = safeHead(localParse);
  assert (extentHead != null);

  // extentHead lives in the local parse; map it back to the corresponding
  // node in the main sentence tree via its token span.
  CoreLabel headLabel = (CoreLabel) extentHead.label();
  Tree realHead = findTreeWithSpan(root,
      headLabel.get(CoreAnnotations.BeginIndexAnnotation.class),
      headLabel.get(CoreAnnotations.EndIndexAnnotation.class));
  assert (realHead != null);
  return realHead;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the method funkyFindLeafWithApproximateSpan of class GenericDataSetReader.
/**
 * Scans the leaves of {@code root} left to right and returns the first leaf
 * whose text equals {@code token} and whose token index falls within
 * {@code [index, index + approximateness]}; returns {@code null} if none matches.
 */
private Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
  logger.fine("Looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString());
  for (Tree leaf : root.getLeaves()) {
    CoreLabel leafLabel = CoreLabel.class.cast(leaf.label());
    int leafIndex = leafLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
    // log.info("Token #" + leafIndex + ": " + leaf.value());
    boolean textMatches = token.equals(leaf.value());
    boolean indexClose = leafIndex >= index && leafIndex <= index + approximateness;
    if (textMatches && indexClose) {
      return leaf;
    }
  }
  // this shouldn't happen
  // but it does happen (VERY RARELY) on some weird web text that includes SGML tags with spaces
  // TODO: does this mean that somehow tokenization is different for the parser? check this by throwing an Exception in KBP
  logger.severe("GenericDataSetReader: WARNING: Failed to find head token");
  logger.severe(" when looking for " + token + " at pos " + index + " plus upto " + approximateness + " in tree: " + root.pennString());
  return null;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the method assignSyntacticHead of class GenericDataSetReader.
/**
 * Find the index of the head of an entity.
 *
 * @param ent The entity mention
 * @param tree The Tree for the entire sentence in which it occurs.
 * @param tokens The Sentence in which it occurs
 * @param setHeadSpan Whether to set the head span in the entity mention.
 * @return The index of the entity head
 */
public int assignSyntacticHead(EntityMention ent, Tree tree, List<CoreLabel> tokens, boolean setHeadSpan) {
  // If the head was already assigned (e.g. read from the corpus), keep it.
  if (ent.getSyntacticHeadTokenPosition() != -1) {
    return ent.getSyntacticHeadTokenPosition();
  }

  logger.finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.toString());
  logger.finest("Flat sentence is: " + tokens);
  Tree sh = null;
  try {
    sh = findSyntacticHead(ent, tree, tokens);
  } catch (Exception | AssertionError e) {
    // Both ordinary exceptions and assertion failures get the exact same
    // recovery, so a single multi-catch replaces the two previously
    // duplicated handlers.
    logger.severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + sentenceToString(tokens));
    e.printStackTrace();
  }

  // Fallback heuristic: the right-most token of the mention's extent.
  int headPos = ent.getExtentTokenEnd() - 1;
  if (sh != null) {
    CoreLabel label = (CoreLabel) sh.label();
    headPos = label.get(CoreAnnotations.BeginIndexAnnotation.class);
  } else {
    logger.fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
    logger.fine("Fallback strategy: will set head to last token in mention: " + tokens.get(headPos));
  }
  ent.setHeadTokenPosition(headPos);

  if (setHeadSpan) {
    // set the head span to match exactly the syntactic head
    // this is needed for some corpora where the head span is not given
    ent.setHeadTokenSpan(new Span(headPos, headPos + 1));
  }
  return headPos;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the method safeHead of class GenericDataSetReader.
/**
 * Returns the head terminal of {@code top} according to {@code headFinder},
 * degrading gracefully: if no head is found, the right-most leaf is returned,
 * and if there are no leaves at all, {@code top} itself is returned.
 */
private Tree safeHead(Tree top) {
  Tree head = top.headTerminal(headFinder);
  if (head != null) {
    return head;
  }
  // No head found: fall back to the right-most leaf, if any exists.
  List<Tree> leaves = top.getLeaves();
  if (!leaves.isEmpty()) {
    return leaves.get(leaves.size() - 1);
  }
  // Last resort: the node itself.
  return top;
}
Use of edu.stanford.nlp.trees.Tree in project CoreNLP by stanfordnlp: the method assignSyntacticHeadToEntities of class MachineReading.
/**
 * Assigns a syntactic head to every entity mention in every sentence of the
 * given corpus, delegating to {@code reader.assignSyntacticHead}.
 */
private void assignSyntacticHeadToEntities(Annotation corpus) {
  assert (corpus != null);
  assert (corpus.get(SentencesAnnotation.class) != null);
  for (CoreMap sent : corpus.get(SentencesAnnotation.class)) {
    List<CoreLabel> tokens = sent.get(TokensAnnotation.class);
    assert (tokens != null);
    Tree tree = sent.get(TreeAnnotation.class);
    // BUG FIX: check for a missing tree BEFORE dereferencing it. The original
    // code called tree.indexSpans(0) first, so a null tree would throw an NPE
    // before the assertion could ever fire.
    assert (tree != null);
    if (MachineReadingProperties.forceGenerationOfIndexSpans) {
      tree.indexSpans(0);
    }
    if (sent.get(EntityMentionsAnnotation.class) != null) {
      for (EntityMention e : sent.get(EntityMentionsAnnotation.class)) {
        reader.assignSyntacticHead(e, tree, tokens, true);
      }
    }
  }
}
Aggregations