Usage of edu.stanford.nlp.coref.data.Mention in the CoreNLP project by stanfordnlp:
the method fromProtoNoTokens of the class ProtobufAnnotationSerializer.
/**
 * Reconstructs a {@link Mention} from its protobuf form, restoring the enum-valued
 * fields, scalar indices/flags, and the dependents / preprocessed-terms collections.
 * Token-level fields (headWord, originalSpan, ...) are NOT restored here; callers
 * are expected to wire those up separately.
 *
 * @param protoMention the serialized mention
 * @return a new Mention with all non-token fields populated
 */
private Mention fromProtoNoTokens(CoreNLPProtos.Mention protoMention) {
  Mention returnMention = new Mention();
  // Enum-valued fields are serialized as strings; an empty string means "unset",
  // so only call valueOf() on non-empty values (valueOf("") would throw).
  // The null checks are defensive: protobuf string getters should never return null.
  if (protoMention.getMentionType() != null && !protoMention.getMentionType().isEmpty()) {
    returnMention.mentionType = Dictionaries.MentionType.valueOf(protoMention.getMentionType());
  }
  if (protoMention.getNumber() != null && !protoMention.getNumber().isEmpty()) {
    returnMention.number = Dictionaries.Number.valueOf(protoMention.getNumber());
  }
  if (protoMention.getGender() != null && !protoMention.getGender().isEmpty()) {
    returnMention.gender = Dictionaries.Gender.valueOf(protoMention.getGender());
  }
  if (protoMention.getAnimacy() != null && !protoMention.getAnimacy().isEmpty()) {
    returnMention.animacy = Dictionaries.Animacy.valueOf(protoMention.getAnimacy());
  }
  if (protoMention.getPerson() != null && !protoMention.getPerson().isEmpty()) {
    returnMention.person = Dictionaries.Person.valueOf(protoMention.getPerson());
  }
  // TODO: if the original Mention had "" for this field the value is lost on round-trip,
  // because "" is indistinguishable from "unset" in the proto.
  if (!protoMention.getHeadString().isEmpty()) {
    returnMention.headString = protoMention.getHeadString();
  }
  // TODO: same ""-vs-unset ambiguity as headString above.
  if (!protoMention.getNerString().isEmpty()) {
    returnMention.nerString = protoMention.getNerString();
  }
  // Scalar indices, IDs, and counters copy over directly.
  returnMention.startIndex = protoMention.getStartIndex();
  returnMention.endIndex = protoMention.getEndIndex();
  returnMention.headIndex = protoMention.getHeadIndex();
  returnMention.mentionID = protoMention.getMentionID();
  returnMention.originalRef = protoMention.getOriginalRef();
  returnMention.goldCorefClusterID = protoMention.getGoldCorefClusterID();
  returnMention.corefClusterID = protoMention.getCorefClusterID();
  returnMention.mentionNum = protoMention.getMentionNum();
  returnMention.sentNum = protoMention.getSentNum();
  returnMention.utter = protoMention.getUtter();
  returnMention.paragraph = protoMention.getParagraph();
  // Grammatical-role and discourse flags.
  returnMention.isSubject = protoMention.getIsSubject();
  returnMention.isDirectObject = protoMention.getIsDirectObject();
  returnMention.isIndirectObject = protoMention.getIsIndirectObject();
  returnMention.isPrepositionObject = protoMention.getIsPrepositionObject();
  returnMention.hasTwin = protoMention.getHasTwin();
  returnMention.generic = protoMention.getGeneric();
  returnMention.isSingleton = protoMention.getIsSingleton();
  // String collections: only allocate when non-empty, matching the serializer's
  // convention of leaving these fields null when there is nothing to store.
  if (protoMention.getDependentsCount() != 0) {
    returnMention.dependents = new HashSet<>(protoMention.getDependentsList());
  }
  if (protoMention.getPreprocessedTermsCount() != 0) {
    returnMention.preprocessedTerms = new ArrayList<>(protoMention.getPreprocessedTermsList());
  }
  return returnMention;
}
Usage of edu.stanford.nlp.coref.data.Mention in the CoreNLP project by stanfordnlp:
the method checkEntityMatch of the class NameMatch.
/**
 * Decides whether two coref clusters denote the same named entity by comparing
 * their representative mentions. Both representatives must be named mentions,
 * at least one must exceed the minimum token count, the clusters' entity
 * attributes must agree, and at least one mention must carry a supported NER
 * type. The final verdict is delegated to {@code mentionMatcher}.
 *
 * <p>As a side effect, a definite non-match is recorded via
 * {@link Document#addIncompatible} so later sieves will not merge the pair.
 *
 * @param document the document being resolved (receives incompatibility marks)
 * @param mentionCluster the cluster containing the anaphoric mention
 * @param potentialAntecedent the candidate antecedent cluster
 * @param dict dictionaries used by the named-mention test
 * @param roleSet mentions filling special discourse roles, excluded from matching
 * @return true iff the matcher judges the two representatives compatible
 */
public boolean checkEntityMatch(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict, Set<Mention> roleSet) {
  Mention mainMention = mentionCluster.getRepresentativeMention();
  Mention antMention = potentialAntecedent.getRepresentativeMention();
  // Both representative mentions must be named mentions.
  if (!isNamedMention(mainMention, dict, roleSet) || !isNamedMention(antMention, dict, roleSet)) {
    return false;
  }
  // At least one of the two mentions must be longer than the minimum token count.
  if (mainMention.originalSpan.size() <= minTokens && antMention.originalSpan.size() <= minTokens) {
    return false;
  }
  // Cluster-level attributes (number/gender/animacy/...) must agree.
  if (!CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent, ignoreGender)) {
    return false;
  }
  // Only attempt matching when at least one side has a supported NER type.
  if (!supportedNerTypes.contains(mainMention.nerString) && !supportedNerTypes.contains(antMention.nerString)) {
    return false;
  }
  // isCompatible() returns a boxed Boolean where null means "no opinion";
  // treat null as a non-match without recording an incompatibility.
  Boolean compatible = mentionMatcher.isCompatible(mainMention, antMention);
  if (compatible == null) {
    return false;
  }
  //Redwood.log("Match '" + mainMention + "' with '" + antMention + "' => " + compatible);
  if (!compatible) {
    // Remember the definite non-match so later sieves will not merge these mentions.
    document.addIncompatible(mainMention, antMention);
  }
  return compatible;
}
Usage of edu.stanford.nlp.coref.data.Mention in the CoreNLP project by stanfordnlp:
the method resolveMention of the class Sieve.
/**
 * Runs this sieve over every predicted mention in the document, attempting to
 * link each mention to a coreferent antecedent in document order.
 *
 * @param document the document whose predicted mentions are resolved
 * @param dict coref dictionaries consulted by the sieve
 * @param props properties controlling debug output and mention-type skipping
 * @return the accumulated debug log (empty unless debug is enabled)
 * @throws Exception if antecedent search fails
 */
public String resolveMention(Document document, Dictionaries dict, Properties props) throws Exception {
  StringBuilder debugLog = new StringBuilder();
  if (HybridCorefProperties.debug(props)) {
    debugLog.append("=======================================================");
    debugLog.append(HybridCorefPrinter.printRawDoc(document, true, true));
  }
  // predictedMentions holds one mention list per sentence.
  for (List<Mention> sentenceMentions : document.predictedMentions) {
    for (int idx = 0, size = sentenceMentions.size(); idx < size; idx++) {
      Mention mention = sentenceMentions.get(idx);
      if (skipMentionType(mention, props)) {
        continue;
      }
      findCoreferentAntecedent(mention, idx, document, dict, props, debugLog);
    }
  }
  return debugLog.toString();
}
Usage of edu.stanford.nlp.coref.data.Mention in the CoreNLP project by stanfordnlp:
the method removeSpuriousMentionsZh of the class CorefMentionFinder.
/**
 * Filters out spurious mention candidates for Chinese documents: numeric/quantity
 * NER spans, punctuation-only spans, stop-listed words and characters, demonyms,
 * and a handful of lexical patterns that are never referential in Chinese.
 * Optionally also removes a nested mention that shares its head word with an
 * enclosing mention. Mutates {@code predictedMentions} in place.
 *
 * @param doc the annotated document, used to fetch per-sentence tokens
 * @param predictedMentions one mention list per sentence; filtered in place
 * @param dict dictionaries supplying removeWords / removeChars / countries lists
 * @param removeNested if true, drop the inner of two head-sharing nested mentions
 *                     (apposition and enumeration cases are exempted)
 */
protected void removeSpuriousMentionsZh(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict, boolean removeNested) {
List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
// this goes through each sentence -- predictedMentions has a list for each sentence
for (int i = 0, sz = predictedMentions.size(); i < sz; i++) {
List<Mention> mentions = predictedMentions.get(i);
List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
// collect doomed mentions first so we never mutate the list while iterating
Set<Mention> remove = Generics.newHashSet();
// NOTE: the branch order below is significant -- each mention is removed for
// at most one (the first matching) reason, which the log messages rely on.
for (Mention m : mentions) {
// numeric/quantity NER spans are not referential
if (m.headWord.ner().matches("PERCENT|MONEY|QUANTITY|CARDINAL")) {
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING number NER: " + m.spanToString());
} else if (m.originalSpan.size() == 1 && m.headWord.tag().equals("CD")) {
// a bare cardinal number token
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING number: " + m.spanToString());
} else if (dict.removeWords.contains(m.spanToString())) {
// whole span is on the stop-word list
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING removeWord: " + m.spanToString());
} else if (mentionContainsRemoveChars(m, dict.removeChars)) {
// span contains a stop-listed character
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING removeChars: " + m.spanToString());
} else if (m.headWord.tag().equals("PU")) {
// punctuation-only mentions
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Punctuation only mention: " + m.spanToString());
} else if (mentionIsDemonym(m, dict.countries)) {
// demonyms -- this seems to be a no-op on devset. Maybe not working?
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Removed demonym: " + m.spanToString());
} else if (m.spanToString().equals("问题") && m.startIndex > 0 && sent.get(m.startIndex - 1).word().endsWith("没")) {
// 没 问题 ("no problem") - this is maybe okay but having 问题 on removeWords was dangerous
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Removed meiyou: " + m.spanToString());
} else if (mentionIsRangren(m, sent)) {
// the 让 人 ("makes one ...") construction
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Removed rangren: " + m.spanToString());
} else if (m.spanToString().equals("你") && m.startIndex < sent.size() - 1 && sent.get(m.startIndex + 1).word().startsWith("知道")) {
// the 你 知道 ("you know") filler phrase
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Removed nizhidao: " + m.spanToString());
// The words that used to be in this case are now handled more generally in removeCharsZh
// } else if (m.spanToString().contains("什么") || m.spanToString().contains("多少")) {
// remove.add(m);
// if (VERBOSE) log.info("MENTION FILTERING Removed many/few mention ending: " + m.spanToString());
} else if (m.spanToString().endsWith("的")) {
// spans ending in the particle 的 are modifier phrases, not mentions
remove.add(m);
if (VERBOSE)
log.info("MENTION FILTERING Removed de ending mention: " + m.spanToString());
// omit this case, it decreases performance. A few useful interrogative pronouns are now in the removeChars list
// } else if (mentionIsInterrogativePronoun(m, dict.interrogativePronouns)) {
// remove.add(m);
// if (VERBOSE) log.info("MENTION FILTERING Removed interrogative pronoun: " + m.spanToString());
}
// 的 handling
// if(m.startIndex>0 && sent.get(m.startIndex-1).word().equals("的")) {
// // remove.add(m);
// Tree t = sentences.get(i).get(TreeAnnotation.class);
// Tree mTree = m.mentionSubTree;
// if(mTree==null) continue;
// for(Tree p : t.pathNodeToNode(mTree, t)) {
// if(mTree==p) continue;
// if(p.value().equals("NP")) {
// remove.add(m);
// }
// }
// }
}
// nested mention with shared headword (except apposition, enumeration): pick larger one
if (removeNested) {
for (Mention m1 : mentions) {
for (Mention m2 : mentions) {
// skip identical pairs and mentions already condemned above
if (m1 == m2 || remove.contains(m1) || remove.contains(m2))
continue;
// m2 nested inside m1 with the same head token: drop the inner m2 ...
if (m1.sentNum == m2.sentNum && m1.headWord == m2.headWord && m2.insideIn(m1)) {
// ... unless m2 is followed by a comma or conjunction, which signals
// apposition or enumeration, where both mentions should survive
if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") || sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
continue;
}
remove.add(m2);
}
}
}
}
mentions.removeAll(remove);
}
// for each sentence
}
Usage of edu.stanford.nlp.coref.data.Mention in the CoreNLP project by stanfordnlp:
the method findHead of the class CorefMentionFinder.
/**
 * Computes and stores the syntactic head (index, token, and lowercased string)
 * for every mention in {@code mentions}, using language-specific head finding.
 * If the computed head index falls outside the mention's span, falls back to
 * treating the first token of the span as the head. Mutates each Mention.
 *
 * @param s the sentence CoreMap holding the parse tree and token list
 * @param mentions mentions of this sentence whose head fields will be filled in
 */
public void findHead(CoreMap s, List<Mention> mentions) {
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
// assign token index spans to tree nodes so head search can map back to tokens
tree.indexSpans(0);
for (Mention m : mentions) {
if (lang == Locale.CHINESE) {
findHeadChinese(sent, m);
} else {
CoreLabel head = (CoreLabel) findSyntacticHead(m, tree, sent).label();
// IndexAnnotation is 1-based; headIndex is 0-based within the sentence
m.headIndex = head.get(CoreAnnotations.IndexAnnotation.class) - 1;
m.headWord = sent.get(m.headIndex);
m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
}
// sanity check: the head must lie inside the mention's original span
int start = m.headIndex - m.startIndex;
if (start < 0 || start >= m.originalSpan.size()) {
Redwood.log("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
Redwood.log("Setting head string to entire mention");
// fall back: first token of the span acts as the head
m.headIndex = m.startIndex;
m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
// NOTE(review): List.toString() yields a bracketed form like "[tok1, tok2]";
// a joined-words string (cf. StringUtils.joinWords above) may have been
// intended here -- confirm before changing.
m.headString = m.originalSpan.toString();
}
}
}
Aggregations