Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
In the class DocumentPreprocessor, the method findTwinMentionsStrict:
/** Mark twin mentions: All mention boundaries should be matched */
private static void findTwinMentionsStrict(Document doc) {
  for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
    List<Mention> golds = doc.goldMentions.get(sentNum);
    List<Mention> predicts = doc.predictedMentions.get(sentNum);

    // For CoNLL training there are some documents with gold mentions with the same position offsets
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip +
            " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
      }
      //assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
    }

    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.containsKey(pos)) {
        Collection<Mention> cm = goldMentionPositions.get(pos);
        int minId = Integer.MAX_VALUE;
        Mention g = null;
        for (Mention m : cm) {
          if (m.mentionID < minId) {
            g = m;
            minId = m.mentionID;
          }
        }
        cm.remove(g);
        p.mentionID = g.mentionID;
        p.hasTwin = true;
        g.hasTwin = true;
      }
    }
  }
}
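The twin-matching above hinges on IntPair behaving as a value type: two separately constructed pairs with the same start and end offsets land on the same key of the CollectionValuedMap. A minimal sketch of that behavior, independent of the coreference classes (the mention IDs here are made up for illustration):

import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.IntPair;

public class IntPairKeyDemo {
  public static void main(String[] args) {
    CollectionValuedMap<IntPair, Integer> goldMentionPositions = new CollectionValuedMap<>();
    // two gold mentions with identical (startIndex, endIndex) offsets; IDs are invented
    goldMentionPositions.add(new IntPair(3, 6), 12);
    goldMentionPositions.add(new IntPair(3, 6), 47);
    // a freshly constructed IntPair with the same values finds both of them,
    // because IntPair defines value-based equals() and hashCode()
    System.out.println(goldMentionPositions.containsKey(new IntPair(3, 6)));  // true
    System.out.println(goldMentionPositions.get(new IntPair(3, 6)).size());   // 2: both grouped under one key
  }
}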
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
In the class CorefMentionFinder, the method addNamedEntityStrings:
protected static void addNamedEntityStrings(CoreMap s, Set<String> neStrings, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(TokensAnnotation.class);
  for (IntPair p : namedEntitySpanSet) {
    StringBuilder sb = new StringBuilder();
    for (int idx = p.get(0); idx < p.get(1); idx++) {
      sb.append(tokens.get(idx).word()).append(" ");
    }
    String str = sb.toString().trim();
    if (str.endsWith(" 's")) {
      str = str.substring(0, str.length() - 3);
    }
    neStrings.add(str);
  }
}
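The method treats each IntPair as a half-open token span [p.get(0), p.get(1)) and strips a trailing possessive clitic from the joined words. A minimal, self-contained sketch of that span-to-string step, using plain strings in place of CoreLabel tokens (the sentence is invented for illustration):

import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.util.IntPair;

public class NamedEntityStringDemo {
  public static void main(String[] args) {
    List<String> tokens = Arrays.asList("I", "met", "Barack", "Obama", "'s", "staff");
    IntPair span = new IntPair(2, 5);              // half-open: tokens 2, 3, 4
    StringBuilder sb = new StringBuilder();
    for (int idx = span.get(0); idx < span.get(1); idx++) {
      sb.append(tokens.get(idx)).append(" ");
    }
    String str = sb.toString().trim();             // "Barack Obama 's"
    if (str.endsWith(" 's")) {
      str = str.substring(0, str.length() - 3);    // drop the possessive clitic
    }
    System.out.println(str);                       // Barack Obama
  }
}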
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
In the class CorefMentionFinder, the method extractPremarkedEntityMentions:
protected static void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedDependency == null) {
    enhancedDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }
  int beginIndex = -1;
  for (CoreLabel w : sent) {
    MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
    if (t != null) {
      // Part of a mention
      if (t.isStart()) {
        // Start of mention
        beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
      }
      if (t.isEnd()) {
        // End of mention
        int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
        if (beginIndex >= 0) {
          IntPair mSpan = new IntPair(beginIndex, endIndex);
          int dummyMentionId = -1;
          Mention m = new Mention(dummyMentionId, beginIndex, endIndex, sent, basicDependency, enhancedDependency,
              new ArrayList<>(sent.subList(beginIndex, endIndex)));
          mentions.add(m);
          mentionSpanSet.add(mSpan);
          beginIndex = -1;
        } else {
          Redwood.log("Start of marked mention not found in sentence: " + t + " at tokenIndex=" +
              (w.get(CoreAnnotations.IndexAnnotation.class) - 1) + " for " + s.get(CoreAnnotations.TextAnnotation.class));
        }
      }
    }
  }
}
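A detail worth noting in the begin/end computation above: IndexAnnotation is a 1-based token index within the sentence, so subtracting 1 at the start token and keeping the raw value at the end token yield a 0-based, half-open [beginIndex, endIndex) span that can be fed directly to List.subList. A small sketch of that arithmetic (the example sentence is invented):

import java.util.Arrays;
import java.util.List;

public class SpanIndexDemo {
  public static void main(String[] args) {
    List<String> sent = Arrays.asList("Barack", "Obama", "visited", "Paris");
    int startIndexAnnotation = 1;                 // "Barack": IndexAnnotation is 1-based
    int endIndexAnnotation = 2;                   // "Obama"
    int beginIndex = startIndexAnnotation - 1;    // 0
    int endIndex = endIndexAnnotation;            // 2, exclusive
    System.out.println(sent.subList(beginIndex, endIndex)); // [Barack, Obama]
  }
}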
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
In the class DependencyCorefMentionFinder, the method getNPSpan:
/**
 * return the left and right most node except copula relation (nsubj & cop) and some others (maybe discourse?)
 * e.g., you are the person -> return "the person"
 */
private IntPair getNPSpan(IndexedWord headword, SemanticGraph dep, List<CoreLabel> sent) {
  int headwordIdx = headword.index() - 1;
  List<IndexedWord> children = dep.getChildList(headword);
  // if(children.size()==0) return new IntPair(headwordIdx, headwordIdx); // the headword is the only word

  // check if we have copula relation
  IndexedWord cop = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.COPULA);
  int startIdx = (cop == null) ? 0 : children.indexOf(cop) + 1;

  // children which will be inside of NP
  List<IndexedWord> insideNP = Generics.newArrayList();
  for (int i = startIdx; i < children.size(); i++) {
    IndexedWord child = children.get(i);
    SemanticGraphEdge edge = dep.getEdge(headword, child);
    if (edge.getRelation().getShortName().matches("dep|discourse|punct")) {
      // skip
      continue;
    } else {
      insideNP.add(child);
    }
  }

  // the headword is the only word
  if (insideNP.size() == 0) {
    return new IntPair(headwordIdx, headwordIdx);
  }

  Pair<IndexedWord, IndexedWord> firstChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(0), dep);
  Pair<IndexedWord, IndexedWord> lastChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(insideNP.size() - 1), dep);

  // headword can be first or last word
  int beginIdx = Math.min(headwordIdx, firstChildLeftRight.first.index() - 1);
  int endIdx = Math.max(headwordIdx, lastChildLeftRight.second.index() - 1);
  return new IntPair(beginIdx, endIdx);
}
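The IntPair returned here holds inclusive, 0-based token indices for the leftmost and rightmost words of the NP; the caller (extractPronounForHeadword, below) widens the second value by one to get a half-open span. A minimal sketch of reading the pair back out (the concrete indices are made up):

import edu.stanford.nlp.util.IntPair;

public class NPSpanDemo {
  public static void main(String[] args) {
    IntPair npSpan = new IntPair(2, 5);   // pretend getNPSpan covered tokens 2..5 inclusive
    int beginIdx = npSpan.get(0);         // 2
    int endIdx = npSpan.get(1) + 1;       // 6, exclusive end as used by the caller
    System.out.println("[" + beginIdx + ", " + endIdx + ")"); // [2, 6)
  }
}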
Use of edu.stanford.nlp.util.IntPair in project CoreNLP by stanfordnlp.
In the class DependencyCorefMentionFinder, the method extractPronounForHeadword:
private void extractPronounForHeadword(IndexedWord headword, SemanticGraph dep, CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basic = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhanced = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhanced == null) {
    enhanced = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }
  int beginIdx = headword.index() - 1;
  int endIdx = headword.index();

  // handle "you all", "they both" etc
  if (sent.size() > headword.index() && sent.get(headword.index()).word().matches("all|both")) {
    IndexedWord c = dep.getNodeByIndex(headword.index() + 1);
    SemanticGraphEdge edge = dep.getEdge(headword, c);
    if (edge != null) {
      endIdx++;
    }
  }

  IntPair mSpan = new IntPair(beginIdx, endIdx);
  if (!mentionSpanSet.contains(mSpan) && (!insideNE(mSpan, namedEntitySpanSet))) {
    int dummyMentionId = -1;
    Mention m = new Mention(dummyMentionId, beginIdx, endIdx, sent, basic, enhanced, new ArrayList<>(sent.subList(beginIdx, endIdx)));
    m.headIndex = headword.index() - 1;
    m.headWord = sent.get(m.headIndex);
    m.headString = m.headWord.word().toLowerCase(Locale.ENGLISH);
    mentions.add(m);
    mentionSpanSet.add(mSpan);
  }

  // when pronoun is a part of conjunction (e.g., you and I)
  Set<IndexedWord> conjChildren = dep.getChildrenWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT);
  if (conjChildren.size() > 0) {
    IntPair npSpan = getNPSpan(headword, dep, sent);
    beginIdx = npSpan.get(0);
    endIdx = npSpan.get(1) + 1;
    // try not to have span that ends with ,
    if (",".equals(sent.get(endIdx - 1).word())) {
      endIdx--;
    }
    addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);
  }
}
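The mentionSpanSet.contains(mSpan) guard above works across calls because IntPair equality is value-based: a span recorded while extracting one mention is recognized later even though the lookup uses a brand-new IntPair object. A minimal sketch of that check in isolation (the span values are made up):

import java.util.HashSet;
import java.util.Set;
import edu.stanford.nlp.util.IntPair;

public class MentionSpanSetDemo {
  public static void main(String[] args) {
    Set<IntPair> mentionSpanSet = new HashSet<>();
    mentionSpanSet.add(new IntPair(4, 5));           // span recorded for an earlier mention
    IntPair mSpan = new IntPair(4, 5);               // same token span, different object
    if (!mentionSpanSet.contains(mSpan)) {
      System.out.println("new mention would be added");
    } else {
      System.out.println("span already covered; mention skipped"); // this branch runs
    }
  }
}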