use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
the class RuleBasedCorefMentionFinder method findSyntacticHead.
/**
 * Finds the tree node to use as the syntactic head of a mention.
 *
 * <p>Strategies are tried in this order:
 * <ol>
 * <li>If {@code findTreeWithSpan} yields a node for the mention span, return {@code safeHead} of it.</li>
 * <li>If {@code allowReparsing} is set, re-parse the mention tokens embedded in an artificial
 * "It was ... ." sentence, take the head found there, and map it back onto {@code root}.</li>
 * <li>Otherwise use {@code findTreeWithSmallestSpan} and accept its head only when the head's
 * token index lies inside the mention span.</li>
 * <li>As a last resort, return a leaf of {@code root} chosen by POS tag (see the final loop).</li>
 * </ol>
 *
 * @param m the mention; its {@code startIndex}, {@code endIndex} and {@code originalSpan} are read
 * @param root the tree that is searched for the head and whose leaves are used in the fallback
 * @param tokens tokens indexed by the mention's {@code startIndex}/{@code endIndex}
 * (presumably the tokens of the sentence containing the mention -- confirm against callers)
 * @return the node chosen as head
 */
protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
// If the mention ends with a possessive marker ('s or '), leave that token out of the span
// used for head finding -- unless the marker is the only token of the mention.
int endIdx = m.endIndex;
if (m.originalSpan.size() > 0) {
String lastWord = m.originalSpan.get(m.originalSpan.size() - 1).get(CoreAnnotations.TextAnnotation.class);
if ((lastWord.equals("'s") || lastWord.equals("'")) && m.originalSpan.size() != 1)
endIdx--;
}
Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
// Preferred case: a node was found for the (adjusted) span; use its head directly.
if (exactMatch != null) {
return safeHead(exactMatch, endIdx);
}
// No match was found. If reparsing is allowed, parse just the extent of the mention,
// embedded in a small sentence context, so as to make the parser work better :-)
if (allowReparsing) {
// Number of tokens dropped from the extent (separated dashes); used below as the
// tolerance when mapping the head back onto the original tree.
int approximateness = 0;
List<CoreLabel> extentTokens = new ArrayList<>();
extentTokens.add(initCoreLabel("It"));
extentTokens.add(initCoreLabel("was"));
final int ADDED_WORDS = 2;
for (int i = m.startIndex; i < endIdx; i++) {
// Add everything except separated dashes! The separated dashes mess with the parser too badly.
CoreLabel label = tokens.get(i);
if (!"-".equals(label.word())) {
// necessary to copy tokens in case the parser does things like
// put new indices on the tokens
extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
} else {
approximateness++;
}
}
extentTokens.add(initCoreLabel("."));
// constrain the parse to the part we're interested in.
// Starting from ADDED_WORDS comes from skipping "It was".
// -1 to exclude the period.
// The pattern ".*" lets the constrained span be any kind of constituent, since there
// are VP and S ones as well as nominal ones.
ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
List<ParserConstraint> constraints = Collections.singletonList(constraint);
Tree tree = parse(extentTokens, constraints);
// now unnecessary, as parser uses CoreLabels?
convertToCoreLabels(tree);
// remember it has ADDED_WORDS extra words at the beginning
tree.indexSpans(m.startIndex - ADDED_WORDS);
Tree subtree = findPartialSpan(tree, m.startIndex);
// There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
// Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
// past the right end (that is, just that final period).
Tree extentHead = safeHead(subtree, endIdx);
// Only checked when assertions are enabled; extentHead is dereferenced just below either way.
assert (extentHead != null);
// extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
// Because we deleted dashes, its index will be >= the index in the extent parse tree
CoreLabel l = (CoreLabel) extentHead.label();
Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
assert (realHead != null);
return realHead;
}
// If reparsing wasn't allowed, try to find a span in the tree
// which happens to have the head
Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
if (wordMatch != null) {
Tree head = safeHead(wordMatch, endIdx);
if (head != null) {
// IndexAnnotation minus 1 is compared against the 0-based mention span; the head is
// accepted only if it falls inside the mention.
int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
if (index >= m.startIndex && index < endIdx) {
return head;
}
}
}
// If that didn't work, guess: default to the last word of the (adjusted) span, but prefer the
// last token whose tag starts with "N" that occurs before the first token whose tag starts with "W".
// NOTE(review): this loop is bounded by m.endIndex rather than the adjusted endIdx -- confirm intended.
int lastNounIdx = endIdx - 1;
for (int i = m.startIndex; i < m.endIndex; i++) {
if (tokens.get(i).tag().startsWith("N"))
lastNounIdx = i;
else if (tokens.get(i).tag().startsWith("W"))
break;
}
List<Tree> leaves = root.getLeaves();
Tree endLeaf = leaves.get(lastNounIdx);
return endLeaf;
}
use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
the class RuleBasedCorefMentionFinder method extractNamedEntityMentions.
/**
 * NER tags whose runs are never turned into named-entity mentions. Compiled once instead of
 * re-compiling the expression through {@code String.matches} on every NER transition.
 */
private static final java.util.regex.Pattern NON_MENTION_NER_TAGS =
    java.util.regex.Pattern.compile("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET");

/**
 * Adds one mention for every maximal run of consecutive tokens sharing the same NER tag,
 * skipping runs whose tag is matched by {@link #NON_MENTION_NER_TAGS}.
 *
 * <p>A token without an NER annotation is treated as tagged {@code "O"}. When a run is
 * immediately followed by the token {@code 's}, that token is included in the mention span.
 * Spans already present in {@code mentionSpanSet} are not added again.
 *
 * @param s the sentence; its tokens and enhanced dependencies are read
 * @param mentions receives the newly created mentions
 * @param mentionSpanSet spans of all mentions found so far; updated with each new span
 * @param namedEntitySpanSet spans of named-entity mentions; updated with each new span
 */
protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  String preNE = "O";
  int beginIndex = -1;
  for (CoreLabel w : sent) {
    String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
    if (nerString == null) {
      // A token that was never NER-tagged is not part of any entity.
      nerString = "O";
    }
    if (!nerString.equals(preNE)) {
      // The NER tag changed at this token, so the previous run ends just before it.
      int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
      if (!NON_MENTION_NER_TAGS.matcher(preNE).matches()) {
        // Pull a directly following possessive marker into the entity span.
        if ("'s".equals(w.get(CoreAnnotations.TextAnnotation.class))) {
          endIndex++;
        }
        IntPair mSpan = new IntPair(beginIndex, endIndex);
        // beginIndex can equal endIndex here when this token was already absorbed into the
        // previous run by the 's heuristic above; such empty spans are skipped.
        if (beginIndex < endIndex && !mentionSpanSet.contains(mSpan)) {
          int dummyMentionId = -1;
          Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
          mentions.add(m);
          mentionSpanSet.add(mSpan);
          namedEntitySpanSet.add(mSpan);
        }
      }
      beginIndex = endIndex;
      preNE = nerString;
    }
  }
  // NE at the end of sentence: the last run is never closed by a tag change inside the loop.
  if (!NON_MENTION_NER_TAGS.matcher(preNE).matches()) {
    IntPair mSpan = new IntPair(beginIndex, sent.size());
    if (!mentionSpanSet.contains(mSpan)) {
      int dummyMentionId = -1;
      Mention m = new Mention(dummyMentionId, beginIndex, sent.size(), dependency, new ArrayList<>(sent.subList(beginIndex, sent.size())));
      mentions.add(m);
      mentionSpanSet.add(mSpan);
      namedEntitySpanSet.add(mSpan);
    }
  }
}
use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
the class Mention method isListLike.
/**
 * Heuristically decides whether this mention looks like a conjunction or list of things,
 * based on "CC" children, commas, and the words "and" / "or".
 *
 * @return {@code true} if the mention appears to be list-like
 */
private boolean isListLike() {
  int commaCount = 0;
  final String spanText = spanToString();
  final String subTreeText = StringUtils.joinWords(mentionSubTree.yieldWords(), " ");

  // Only trust the subtree's children when the subtree yields exactly the mention text.
  if (subTreeText.equals(spanText)) {
    for (Tree child : mentionSubTree.getChildrenAsList()) {
      final String childLabel = child.value();
      // The NER tag is only looked up when the child itself is a leaf.
      final String childNer = child.isLeaf()
          ? ((CoreLabel) child.getLeaves().get(0).label()).ner()
          : null;
      final boolean outsideEntity = (childNer == null || "O".equals(childNer));
      if ("CC".equals(childLabel)) {
        // A coordinating conjunction that is not part of a named entity settles it.
        if (outsideEntity) {
          return true;
        }
      } else if (childLabel.equals(",") && outsideEntity) {
        commaCount++;
      }
    }
  }

  // More than two entity-free commas is taken as list-like without looking further.
  if (commaCount > 2) {
    return true;
  }

  // Otherwise scan the tokens for a non-initial "and" / "or" outside a named entity.
  boolean pastFirstToken = false;
  for (CoreLabel token : originalSpan) {
    final String tag = token.tag();
    final String ner = token.ner();
    final String word = token.word();
    // prepositions and verbs are too hard for us
    if (tag.equals("TO") || tag.equals("IN") || tag.startsWith("VB")) {
      return false;
    }
    if (pastFirstToken
        && (word.equalsIgnoreCase("and") || word.equalsIgnoreCase("or"))
        && (ner == null || "O".equals(ner))) {
      return true;
    }
    pastFirstToken = true;
  }
  return false;
}
use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
the class Mention method removePhraseAfterHead.
/**
 * Returns the mention text with any clause after the headword removed.
 *
 * <p>The text is cut just before the first comma-tagged token if there is one, otherwise just
 * before the first token whose POS tag starts with "W". The cut is only applied when the head
 * precedes the cut position; if it does not, the empty string is returned. When the mention
 * contains neither kind of token, the full span string is returned.
 *
 * @return the truncated mention text, the full span text, or {@code ""} as described above
 */
public String removePhraseAfterHead() {
  int firstCommaPos = -1;
  int firstWhPos = -1;
  int offset = 0;
  for (CoreLabel token : this.originalSpan) {
    final String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    if (firstCommaPos == -1 && posTag.equals(",")) {
      firstCommaPos = this.startIndex + offset;
    }
    if (firstWhPos == -1 && posTag.startsWith("W")) {
      firstWhPos = this.startIndex + offset;
    }
    offset++;
  }

  // Nothing to cut at: keep the whole mention.
  if (firstCommaPos == -1 && firstWhPos == -1) {
    return this.spanToString();
  }

  // A comma takes precedence over a wh-word as the cut position.
  final int cutPos = (firstCommaPos != -1) ? firstCommaPos : firstWhPos;
  if (this.headIndex >= cutPos) {
    return "";
  }

  final int keptTokens = cutPos - this.startIndex;
  final StringBuilder prefix = new StringBuilder();
  for (int k = 0; k < keptTokens; k++) {
    if (k > 0) {
      prefix.append(" ");
    }
    prefix.append(this.originalSpan.get(k).get(CoreAnnotations.TextAnnotation.class));
  }
  return prefix.toString();
}
use of edu.stanford.nlp.ling.CoreLabel in project CoreNLP by stanfordnlp.
the class MentionExtractor method mergeLabels.
/**
 * Replaces the label of each leaf of {@code tree} with the CoreLabel at the same position in
 * {@code sentence}, then re-indexes the leaves of the tree.
 * Each leaf's original {@code value()} is kept by writing it into the CoreLabel's
 * ValueAnnotation before the label is swapped in.
 *
 * @param tree the tree whose leaf labels are replaced
 * @param sentence the CoreLabels to install, one per leaf, in leaf order
 */
// todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
public static void mergeLabels(Tree tree, List<CoreLabel> sentence) {
  final List<Tree> leaves = tree.getLeaves();
  for (int position = 0; position < leaves.size(); position++) {
    final Tree leaf = leaves.get(position);
    final CoreLabel token = sentence.get(position);
    token.set(CoreAnnotations.ValueAnnotation.class, leaf.value());
    leaf.setLabel(token);
  }
  tree.indexLeaves();
}
Aggregations