Usage of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp:
class MentionDetectionClassifier, method extractFeatures.
/**
 * Extracts mention-detection features for a candidate mention.
 *
 * <p>Features cover the head word's NER type, whether the mention's span is a known
 * NE string, the words/POS tags adjacent to and at the edges of the span, and
 * whether the mention nests inside / contains another mention sharing its head.
 *
 * @param p         the candidate mention
 * @param shares    all mentions sharing {@code p}'s head position (may include {@code p})
 * @param neStrings lowercase normalized span strings of known named-entity mentions
 * @param dict      unused here; kept for signature compatibility with callers
 * @param props     unused here; kept for signature compatibility with callers
 * @return a counter of indicator features (each present feature has count 1)
 */
public static Counter<String> extractFeatures(Mention p, Set<Mention> shares, Set<String> neStrings, Dictionaries dict, Properties props) {
  Counter<String> features = new ClassicCounter<>();
  String span = p.lowercaseNormalizedSpanString();
  String ner = p.headWord.ner();
  int sIdx = p.startIndex;
  int eIdx = p.endIndex;

  List<CoreLabel> sent = p.sentenceWords;
  // Neighbors are null at sentence boundaries.
  CoreLabel preWord = (sIdx == 0) ? null : sent.get(sIdx - 1);
  CoreLabel nextWord = (eIdx == sent.size()) ? null : sent.get(eIdx);
  CoreLabel firstWord = p.originalSpan.get(0);
  CoreLabel lastWord = p.originalSpan.get(p.originalSpan.size() - 1);

  features.incrementCount("B-NETYPE-" + ner);
  if (neStrings.contains(span)) {
    features.incrementCount("B-NE-STRING-EXIST");
    // Full NE span: neither neighbor continues the same NER tag.
    if ((preWord == null || !preWord.ner().equals(ner)) && (nextWord == null || !nextWord.ner().equals(ner))) {
      features.incrementCount("B-NE-FULLSPAN");
    }
  }
  if (preWord != null) {
    features.incrementCount("B-PRECEDINGWORD-" + preWord.word());
    features.incrementCount("B-PRECEDINGPOS-" + preWord.tag());
  }
  if (nextWord != null) {
    features.incrementCount("B-FOLLOWINGWORD-" + nextWord.word());
    features.incrementCount("B-FOLLOWINGPOS-" + nextWord.tag());
  }
  features.incrementCount("B-FIRSTWORD-" + firstWord.word());
  features.incrementCount("B-FIRSTPOS-" + firstWord.tag());
  features.incrementCount("B-LASTWORD-" + lastWord.word());
  // BUGFIX: was "B-LASTWORD-" + lastWord.tag(), which collided with the
  // last-word feature key above; use a distinct key mirroring the
  // FIRSTWORD/FIRSTPOS pair. NOTE(review): models trained against the old
  // feature name must be retrained (or keep the legacy key when loading one).
  features.incrementCount("B-LASTPOS-" + lastWord.tag());

  // Nesting features, computed in a single pass (the original scanned `shares`
  // twice); both features may fire, exactly as before.
  boolean biggerThanAnother = false;
  boolean smallerThanAnother = false;
  for (Mention s : shares) {
    if (s == p) continue;
    if (!biggerThanAnother && s.insideIn(p)) biggerThanAnother = true;
    if (!smallerThanAnother && p.insideIn(s)) smallerThanAnother = true;
    if (biggerThanAnother && smallerThanAnother) break;
  }
  if (biggerThanAnother) features.incrementCount("B-BIGGER-THAN-ANOTHER");
  if (smallerThanAnother) features.incrementCount("B-SMALLER-THAN-ANOTHER");

  return features;
}
Usage of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp:
class MentionDetectionClassifier, method classifyMentions.
/**
 * Prunes the predicted mentions so that, among mentions sharing the same head
 * position within a sentence, only the most probable one (per
 * {@code probabilityOf}) survives.
 *
 * @param predictedMentions mentions per sentence; pruned in place
 * @param dict  dictionaries passed through to the probability model
 * @param props properties passed through to the probability model
 */
public void classifyMentions(List<List<Mention>> predictedMentions, Dictionaries dict, Properties props) {
  // Collect lowercase span strings of mentions whose head has a non-O NER tag
  // and whose every span token carries that same tag.
  Set<String> neStrings = Generics.newHashSet();
  for (List<Mention> predictedMention : predictedMentions) {
    for (Mention m : predictedMention) {
      String ne = m.headWord.ner();
      if (ne.equals("O"))
        continue;
      // BUGFIX: the original inner loop ended in `continue`, which only skipped
      // the inner iteration — a no-op — so the uniform-NER filter never applied.
      // NOTE(review): this changes which strings land in neStrings; confirm
      // downstream feature extraction expects the filtered set.
      boolean uniformNE = true;
      for (CoreLabel cl : m.originalSpan) {
        if (!cl.ner().equals(ne)) {
          uniformNE = false;
          break;
        }
      }
      if (!uniformNE)
        continue;
      neStrings.add(m.lowercaseNormalizedSpanString());
    }
  }

  for (List<Mention> predicts : predictedMentions) {
    // Group this sentence's mentions by head index.
    Map<Integer, Set<Mention>> headPositions = Generics.newHashMap();
    for (Mention p : predicts) {
      Set<Mention> sharing = headPositions.get(p.headIndex);
      if (sharing == null) {
        sharing = Generics.newHashSet();
        headPositions.put(p.headIndex, sharing);
      }
      sharing.add(p);
    }

    Set<Mention> remove = Generics.newHashSet();
    for (Map.Entry<Integer, Set<Mention>> entry : headPositions.entrySet()) {
      Set<Mention> shares = entry.getValue();
      if (shares.size() <= 1)
        continue;
      // Score each competing mention; keep the argmax (ties broken by span
      // string ordering for determinism) and schedule the rest for removal.
      Counter<Mention> probs = new ClassicCounter<>();
      for (Mention p : shares) {
        probs.incrementCount(p, probabilityOf(p, shares, neStrings, dict, props));
      }
      Mention keep = Counters.argmax(probs, (m1, m2) -> m1.spanToString().compareTo(m2.spanToString()));
      probs.remove(keep);
      remove.addAll(probs.keySet());
    }
    predicts.removeAll(remove);
  }
}
Usage of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp:
class RuleBasedCorefMentionFinder, method extractNamedEntityMentions.
/**
 * Scans a sentence's NER tags and adds one Mention per maximal run of identical
 * (non-excluded) NER labels. Quantity-like labels (QUANTITY, CARDINAL, PERCENT,
 * DATE, DURATION, TIME, SET) and "O" never produce mentions.
 *
 * @param s                 the sentence CoreMap (tokens + dependency parses)
 * @param mentions          output list; new mentions are appended
 * @param mentionSpanSet    spans already claimed by any mention; used for dedup and updated here
 * @param namedEntitySpanSet output set of spans recognized as named entities
 */
protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
// Fall back to the basic dependencies when no enhanced graph was annotated.
if (enhancedDependency == null) {
enhancedDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
}
// preNE tracks the NER label of the run currently being scanned; beginIndex is
// that run's 0-based start token (-1 until the first label change is seen).
String preNE = "O";
int beginIndex = -1;
for (CoreLabel w : sent) {
String nerString = w.ner();
// A label change closes the previous run [beginIndex, endIndex).
if (!nerString.equals(preNE)) {
// IndexAnnotation is 1-based, so subtracting 1 gives the 0-based index of w,
// i.e. the exclusive end of the run that just finished.
int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
if (!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")) {
// Include a possessive clitic ("'s") in the mention span.
if (w.get(CoreAnnotations.TextAnnotation.class).equals("'s") && w.tag().equals("POS")) {
endIndex++;
}
IntPair mSpan = new IntPair(beginIndex, endIndex);
// attached to the previous NER by the earlier heuristic
if (beginIndex < endIndex && !mentionSpanSet.contains(mSpan)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIndex, endIndex, sent, basicDependency, enhancedDependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
mentions.add(m);
mentionSpanSet.add(mSpan);
namedEntitySpanSet.add(mSpan);
}
}
// The new run starts where the old one ended. NOTE(review): when the "'s"
// branch bumped endIndex, the next run starts one token late by design of
// the original heuristic — confirm before changing.
beginIndex = endIndex;
preNE = nerString;
}
}
// NE at the end of sentence
if (!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")) {
IntPair mSpan = new IntPair(beginIndex, sent.size());
if (!mentionSpanSet.contains(mSpan)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIndex, sent.size(), sent, basicDependency, enhancedDependency, new ArrayList<>(sent.subList(beginIndex, sent.size())));
mentions.add(m);
mentionSpanSet.add(mSpan);
namedEntitySpanSet.add(mSpan);
}
}
}
Usage of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp:
class RuleBasedCorefMentionFinder, method extractNPorPRP.
/**
 * Adds a mention for every NP or pronoun subtree that the Tregex pattern
 * {@code npOrPrpMentionPattern} matches in the sentence's parse tree.
 *
 * @param s                  the sentence CoreMap (tokens, parse tree, dependency graphs)
 * @param mentions           output list; new mentions are appended
 * @param mentionSpanSet     spans already claimed; used for dedup and updated here
 * @param namedEntitySpanSet spans claimed by named entities; NP/PRP spans nested
 *                           inside one are skipped (except for Chinese)
 */
public void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  tree.indexLeaves();

  SemanticGraph basicDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedDependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedDependency == null) {
    // No enhanced graph annotated; fall back to the basic dependencies.
    enhancedDependency = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }

  TregexMatcher matcher = npOrPrpMentionPattern.matcher(tree);
  while (matcher.find()) {
    Tree match = matcher.getMatch();
    List<Tree> leaves = match.getLeaves();
    CoreLabel firstLeaf = (CoreLabel) leaves.get(0).label();
    CoreLabel lastLeaf = (CoreLabel) leaves.get(leaves.size() - 1).label();
    // IndexAnnotation is 1-based; convert to a 0-based, end-exclusive span.
    int beginIdx = firstLeaf.get(CoreAnnotations.IndexAnnotation.class) - 1;
    int endIdx = lastLeaf.get(CoreAnnotations.IndexAnnotation.class);
    IntPair span = new IntPair(beginIdx, endIdx);

    // Skip spans already claimed and, outside Chinese, spans inside an NE span.
    if (mentionSpanSet.contains(span) || (lang != Locale.CHINESE && insideNE(span, namedEntitySpanSet))) {
      continue;
    }

    int dummyMentionId = -1;
    Mention mention = new Mention(dummyMentionId, beginIdx, endIdx, sent, basicDependency, enhancedDependency,
        new ArrayList<>(sent.subList(beginIdx, endIdx)), match);
    mentions.add(mention);
    mentionSpanSet.add(span);
  }
}
Usage of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp:
class RuleBasedCorefMentionFinder, method removeSpuriousMentionsEn.
/**
 * Filters out spurious English mentions in place: pleonastic "it", non-words,
 * negative quantifiers, partitives, bare singular NPs, percent/money
 * expressions, adjectival demonyms, stop-list entries, and (with exceptions for
 * likely apposition/enumeration) the smaller of two nested mentions that share
 * a head word.
 *
 * @param doc               the annotated document (provides the sentences)
 * @param predictedMentions mentions per sentence, parallel to the document's sentences; pruned in place
 * @param dict              dictionaries backing the individual filtering rules
 */
@Override
public void removeSpuriousMentionsEn(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict) {
  // (removed: unused local `standAlones` that was allocated but never read)
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  for (int i = 0; i < predictedMentions.size(); i++) {
    CoreMap s = sentences.get(i);
    List<Mention> mentions = predictedMentions.get(i);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Set<Mention> remove = Generics.newHashSet();

    for (Mention m : mentions) {
      String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      // Pleonastic "it" ("It is raining").
      if (isPleonastic(m, tree)) {
        remove.add(m);
      }
      // Non-words such as "hmm".
      if (dict.nonWords.contains(m.headString)) {
        remove.add(m);
      }
      // Quantifier rule: drop mentions starting with a negative quantifier.
      if (m.originalSpan.size() > 0) {
        String firstWord = m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
        if (firstWord.matches("none|no|nothing|not")) {
          remove.add(m);
        }
      }
      // Partitive constructions ("a lot of X").
      if (partitiveRule(m, sent, dict)) {
        remove.add(m);
      }
      // Bare singular NP that is not a temporal expression: single token, or
      // preceded only by an adjective.
      if (headPOS.equals("NN") && !dict.temporals.contains(m.headString)
          && (m.originalSpan.size() == 1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) {
        remove.add(m);
      }
      // Percent and money expressions.
      if (m.headString.equals("%")) {
        remove.add(m);
      }
      if (headNE.equals("PERCENT") || headNE.equals("MONEY")) {
        remove.add(m);
      }
      // Adjectival demonyms ("the American embassy").
      if (dict.isAdjectivalDemonym(m.spanToString())) {
        remove.add(m);
      }
      // Stop list (e.g., U.S., there).
      if (inStopList(m)) {
        remove.add(m);
      }
    }

    // Nested mentions with a shared head word: keep the larger one, unless the
    // inner mention is followed by "," or "CC" (likely apposition/enumeration).
    for (Mention m1 : mentions) {
      for (Mention m2 : mentions) {
        if (m1 == m2 || remove.contains(m1) || remove.contains(m2)) {
          continue;
        }
        if (m1.sentNum == m2.sentNum && m1.headWord == m2.headWord && m2.insideIn(m1)) {
          if (m2.endIndex < sent.size()) {
            // Hoisted: the POS of the token after m2 was fetched twice before.
            String followingPOS = sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class);
            if (followingPOS.equals(",") || followingPOS.equals("CC")) {
              continue;
            }
          }
          remove.add(m2);
        }
      }
    }
    mentions.removeAll(remove);
  }
}
Aggregations