Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
Class ChineseHcorefDemo, method main.
/**
 * Demo entry point: runs the Chinese coreference pipeline over a fixed, pre-segmented news
 * passage, prints every coreference chain, then the mentions of each sentence, and finally the
 * elapsed wall-clock time.
 *
 * <p>The command-line arguments are not consulted; the pipeline is always configured from the
 * built-in {@code -props} setting below.
 *
 * @param args ignored
 * @throws Exception declared by the signature; nothing is caught inside this method
 */
public static void main(String[] args) throws Exception {
  final long startedAt = System.currentTimeMillis();

  // Whitespace-separated tokens of the sample passage.
  final String passage = "俄罗斯 航空 公司 一 名 官员 在 9号 说 , "
      + "米洛舍维奇 的 儿子 马可·米洛舍维奇 9号 早上 持 外交 护照 从 俄国 首都 莫斯科 搭机 飞往 中国 大陆 北京 , "
      + "可是 就 在 稍后 就 返回 莫斯科 。 "
      + "这 名 俄国 航空 公司 官员 说 马可 是 因为 护照 问题 而 在 北京 机场 被 中共 遣返 莫斯科 。 "
      + "北京 机场 方面 的 这 项 举动 清楚 显示 中共 有意 放弃 在 总统 大选 落败 的 前 南斯拉夫 总统 米洛舍维奇 , "
      + "因此 他 在 南斯拉夫 受到 民众 厌恶 的 儿子 马可 才 会 在 北京 机场 被 中共 当局 送回 莫斯科 。 "
      + "马可 持 外交 护照 能够 顺利 搭机 离开 莫斯科 , 但是 却 在 北京 受阻 , 可 算是 踢到 了 铁板 。 "
      + "可是 这 项 消息 和 先前 外界 谣传 中共 当局 准备 提供 米洛舍维奇 和 他 的 家人 安全 庇护所 有 着 很 大 的 出入 ,"
      + " 一般 认为 在 去年 米洛舍维奇 挥兵 攻打 科索沃 境内 阿尔巴尼亚 一 分离主义 分子 的 时候 , "
      + "强力 反对 北约 组织 攻击 南斯拉夫 的 中共 , 会 全力 保护 米洛舍维奇 和 他 的 家人 及 亲信 。 "
      + "可是 从 9号 马可 被 送回 莫斯科 一 事 看 起来 , 中共 很 可能 会 放弃 米洛舍维奇 。";

  // Fixed configuration, kept in its own local instead of overwriting the parameter.
  final String[] pipelineArgs = {"-props", "edu/stanford/nlp/hcoref/properties/zh-coref-default.properties"};

  Annotation annotated = new Annotation(passage);
  Properties pipelineProps = StringUtils.argsToProperties(pipelineArgs);
  StanfordCoreNLP corefPipeline = new StanfordCoreNLP(pipelineProps);
  corefPipeline.annotate(annotated);

  // Document-level output: one line per coreference chain.
  System.out.println("---");
  System.out.println("coref chains");
  annotated.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()
      .forEach(chain -> System.out.println("\t" + chain));

  // Sentence-level output: one section per sentence listing its mentions.
  for (CoreMap sent : annotated.get(CoreAnnotations.SentencesAnnotation.class)) {
    System.out.println("---");
    System.out.println("mentions");
    for (Mention found : sent.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
      System.out.println("\t" + found);
    }
  }

  // Whole seconds elapsed, reported as minutes and remaining seconds.
  final long elapsedSeconds = (System.currentTimeMillis() - startedAt) / 1000;
  System.out.println("Running time " + elapsedSeconds / 60 + "min " + elapsedSeconds % 60 + "s");
}
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
Class DeterministicCorefSieve, method coreferent.
/**
 * Checks if two clusters are coreferent according to our sieve pass constraints.
 *
 * <p>The rules are evaluated strictly in the order written and most of them return immediately,
 * so the order is part of the behavior: hard filters first (known incompatibility, sentence
 * distance, generic mentions, nesting), then the speaker and discourse rules, then the
 * incompatibility constraints, then the match rules selected by {@code flags}. Some match rules
 * only set a tentative result that later veto rules may still turn into {@code false}.
 *
 * <p>Side effects: rejected pairs may be recorded through {@code document.addIncompatible(...)},
 * and the speaker rules may copy {@code speakerInfo} from one mention to the other.
 *
 * @param document document being resolved; read for incompatibility, document type, CoNLL and
 *     doc-info data and speakers, and updated with newly found incompatible pairs
 * @param mentionCluster cluster of the current mention; its representative mention is the one
 *     used by most match rules
 * @param potentialAntecedent cluster of the candidate antecedent
 * @param mention2 the individual mention under consideration, used for the distance, nesting,
 *     speaker and coref-dictionary checks
 * @param ant the candidate antecedent mention
 * @param dict dictionaries consulted by the pronoun, demonym and dictionary-based rules
 * @param roleSet set of mentions handed through to the string- and entity-match rules
 * @return {@code true} if the active rules accept the pair as coreferent, {@code false} otherwise
 * @throws Exception declared by the signature; NOTE(review): which callee actually throws is not
 *     visible from this method
 */
public boolean coreferent(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention mention2, Mention ant, Dictionaries dict, Set<Mention> roleSet) throws Exception {
  // Tentative verdict set by the "soft" match rules; returned at the very end unless vetoed.
  boolean ret = false;
  Mention mention = mentionCluster.getRepresentativeMention();
  if (flags.USE_INCOMPATIBLES) {
    // Allows definite no's from previous sieves to propagate down
    if (document.isIncompatible(mentionCluster, potentialAntecedent)) {
      return false;
    }
  }
  // Pronoun pass: give up beyond a 3-sentence window unless the mention is first/second person.
  if (flags.DO_PRONOUN && Math.abs(mention2.sentNum - ant.sentNum) > 3 && mention2.person != Person.I && mention2.person != Person.YOU) {
    return false;
  }
  // A bare "this" is likewise only resolved within 3 sentences.
  if (mention2.lowercaseNormalizedSpanString().equals("this") && Math.abs(mention2.sentNum - ant.sentNum) > 3) {
    return false;
  }
  // Constant-first equals: a head word without a speaker annotation no longer throws here.
  if (mention2.person == Person.YOU && document.docType == DocType.ARTICLE && "PER0".equals(mention2.headWord.get(CoreAnnotations.SpeakerAnnotation.class))) {
    return false;
  }
  // Generic mentions are only filtered when CoNLL data is attached to the document.
  if (document.conllDoc != null) {
    if (ant.generic && ant.person == Person.YOU) {
      return false;
    }
    if (mention2.generic) {
      return false;
    }
  }
  // chinese newswire contains coref nested NPs with shared headword Chen & Ng
  if (lang != Locale.CHINESE || document.docInfo == null || !document.docInfo.getOrDefault("DOC_ID", "").contains("nw")) {
    // Everywhere else, a mention nested inside the other one is never coreferent with it.
    if (mention2.insideIn(ant) || ant.insideIn(mention2)) {
      return false;
    }
  }
  if (flags.USE_SPEAKERMATCH) {
    String mSpeaker = mention2.headWord.get(SpeakerAnnotation.class);
    String aSpeaker = ant.headWord.get(SpeakerAnnotation.class);
    // <I> from same speaker
    if (mention2.person == Person.I && ant.person == Person.I) {
      return (mSpeaker.equals(aSpeaker));
    }
    // <I> - speaker
    if ((mention2.person == Person.I && mSpeaker.equals(Integer.toString(ant.mentionID))) || (ant.person == Person.I && aSpeaker.equals(Integer.toString(mention2.mentionID)))) {
      return true;
    }
  }
  if (flags.USE_DISCOURSEMATCH) {
    String mString = mention.lowercaseNormalizedSpanString();
    String antString = ant.lowercaseNormalizedSpanString();
    // mention and ant both belong to the same speaker cluster
    if (mention.speakerInfo != null && mention.speakerInfo == ant.speakerInfo) {
      return true;
    }
    // (I - I) in the same speaker's quotation.
    if (mention.number == Number.SINGULAR && dict.firstPersonPronouns.contains(mString) && ant.number == Number.SINGULAR && dict.firstPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, mention, ant)) {
      return true;
    }
    // (speaker - I)
    if ((mention.number == Number.SINGULAR && dict.firstPersonPronouns.contains(mString)) && CorefRules.antecedentIsMentionSpeaker(document, mention, ant, dict)) {
      // Propagate the speaker info to the pronoun if it has none yet.
      if (mention.speakerInfo == null && ant.speakerInfo != null) {
        mention.speakerInfo = ant.speakerInfo;
      }
      return true;
    }
    // (I - speaker)
    if ((ant.number == Number.SINGULAR && dict.firstPersonPronouns.contains(antString)) && CorefRules.antecedentIsMentionSpeaker(document, ant, mention, dict)) {
      if (ant.speakerInfo == null && mention.speakerInfo != null) {
        ant.speakerInfo = mention.speakerInfo;
      }
      return true;
    }
    // Can be iffy if more than two speakers... but still should be okay most of the time
    if (dict.secondPersonPronouns.contains(mString) && dict.secondPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, mention, ant)) {
      return true;
    }
    // previous I - you or previous you - I in two person conversation
    if (((mention.person == Person.I && ant.person == Person.YOU || (mention.person == Person.YOU && ant.person == Person.I)) && (mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && document.docType == DocType.CONVERSATION)) {
      return true;
    }
    // Reflexive pronoun whose antecedent stands in a subject-object relation to it.
    if (dict.reflexivePronouns.contains(mention.headString) && CorefRules.entitySubjectObject(mention, ant)) {
      return true;
    }
  }
  // Pairwise incompatibility checks, run only when none of these four match flags is set.
  if (!flags.USE_EXACTSTRINGMATCH && !flags.USE_RELAXED_EXACTSTRINGMATCH && !flags.USE_APPOSITION && !flags.USE_WORDS_INCLUSION) {
    for (Mention m : mentionCluster.getCorefMentions()) {
      for (Mention a : potentialAntecedent.getCorefMentions()) {
        // vv gabor - re-enabled code (seems to improve performance) vv
        if (m.person != Person.I && a.person != Person.I && (CorefRules.antecedentIsMentionSpeaker(document, m, a, dict) || CorefRules.antecedentIsMentionSpeaker(document, a, m, dict))) {
          document.addIncompatible(m, a);
          return false;
        }
        // ^^ end block of code in question ^^
        int dist = Math.abs(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - a.headWord.get(CoreAnnotations.UtteranceAnnotation.class));
        // Adjacent utterances by different speakers: same-person pronouns cannot corefer.
        if (document.docType != DocType.ARTICLE && dist == 1 && !CorefRules.entitySameSpeaker(document, m, a)) {
          if (m.person == Person.I && a.person == Person.I) {
            document.addIncompatible(m, a);
            return false;
          }
          if (m.person == Person.YOU && a.person == Person.YOU) {
            document.addIncompatible(m, a);
            return false;
          }
          // This is weak since we can refer to both speakers
          if (m.person == Person.WE && a.person == Person.WE) {
            document.addIncompatible(m, a);
            return false;
          }
        }
      }
    }
    // In articles, any subject-object pair across the two clusters rules the merge out.
    if (document.docType == DocType.ARTICLE) {
      for (Mention m : mentionCluster.getCorefMentions()) {
        for (Mention a : potentialAntecedent.getCorefMentions()) {
          if (CorefRules.entitySubjectObject(m, a)) {
            document.addIncompatible(m, a);
            return false;
          }
        }
      }
    }
  }
  // Incompatibility constraints - do before match checks
  if (flags.USE_iwithini && CorefRules.entityIWithinI(mention, ant, dict)) {
    document.addIncompatible(mention, ant);
    return false;
  }
  // Match checks
  if (flags.USE_EXACTSTRINGMATCH && CorefRules.entityExactStringMatch(mention, ant, dict, roleSet)) {
    return true;
  }
  // Name match is only tentative: the veto rules further down can still reject it.
  if (flags.USE_NAME_MATCH && checkEntityMatch(document, mentionCluster, potentialAntecedent, dict, roleSet)) {
    ret = true;
  }
  if (flags.USE_RELAXED_EXACTSTRINGMATCH && CorefRules.entityRelaxedExactStringMatch(mentionCluster, potentialAntecedent, mention, ant, dict, roleSet)) {
    return true;
  }
  if (flags.USE_APPOSITION && CorefRules.entityIsApposition(mentionCluster, potentialAntecedent, mention, ant)) {
    return true;
  }
  if (flags.USE_PREDICATENOMINATIVES && CorefRules.entityIsPredicateNominatives(mentionCluster, potentialAntecedent, mention, ant)) {
    return true;
  }
  if (flags.USE_ACRONYM && CorefRules.entityIsAcronym(document, mentionCluster, potentialAntecedent)) {
    return true;
  }
  if (flags.USE_RELATIVEPRONOUN && CorefRules.entityIsRelativePronoun(mention, ant)) {
    return true;
  }
  if (flags.USE_DEMONYM && mention.isDemonym(ant, dict)) {
    return true;
  }
  if (flags.USE_ROLEAPPOSITION) {
    if (lang == Locale.CHINESE) {
      // For Chinese this resets the tentative verdict, including an earlier name match.
      ret = false;
    } else if (CorefRules.entityIsRoleAppositive(mentionCluster, potentialAntecedent, mention, ant, dict)) {
      ret = true;
    }
  }
  if (flags.USE_INCLUSION_HEADMATCH && CorefRules.entityHeadsAgree(mentionCluster, potentialAntecedent, mention, ant, dict)) {
    ret = true;
  }
  if (flags.USE_RELAXED_HEADMATCH && CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mentionCluster, potentialAntecedent, mention, ant)) {
    ret = true;
  }
  // Veto rules: the first three only apply to a tentative match, the remaining ones always.
  if (flags.USE_WORDS_INCLUSION && ret && !CorefRules.entityWordsIncluded(mentionCluster, potentialAntecedent, mention, ant)) {
    return false;
  }
  if (flags.USE_INCOMPATIBLE_MODIFIER && ret && CorefRules.entityHaveIncompatibleModifier(mentionCluster, potentialAntecedent)) {
    return false;
  }
  if (flags.USE_PROPERHEAD_AT_LAST && ret && !CorefRules.entitySameProperHeadLastWord(mentionCluster, potentialAntecedent, mention, ant)) {
    return false;
  }
  if (flags.USE_ATTRIBUTES_AGREE && !CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent)) {
    return false;
  }
  if (flags.USE_DIFFERENT_LOCATION && CorefRules.entityHaveDifferentLocation(mention, ant, dict)) {
    return false;
  }
  if (flags.USE_NUMBER_IN_MENTION && CorefRules.entityNumberInLaterMention(mention, ant)) {
    return false;
  }
  if (flags.USE_DISTANCE && CorefRules.entityTokenDistance(mention2, ant)) {
    return false;
  }
  if (flags.USE_COREF_DICT) {
    // Head match
    if (ant.headWord.lemma().equals(mention2.headWord.lemma())) {
      return false;
    }
    // Constraint: ignore pairs commonNoun - properNoun
    if (ant.mentionType != MentionType.PROPER && (mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()))) {
      return false;
    }
    // Constraint: ignore plurals
    if (ant.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS") && mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")) {
      return false;
    }
    // Constraint: ignore mentions with indefinite determiners
    if (dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma()) || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) {
      return false;
    }
    // Constraint: ignore coordinated mentions
    if (ant.isCoordinated() || mention2.isCoordinated()) {
      return false;
    }
    // Constraint: context incompatibility
    if (CorefRules.contextIncompatible(mention2, ant, dict)) {
      return false;
    }
    // Constraint: sentence context incompatibility when the mentions are common nouns
    if (CorefRules.sentenceContextIncompatible(mention2, ant, dict)) {
      return false;
    }
    // Dictionary look-ups, tried in this fixed order.
    if (CorefRules.entityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8)) {
      return true;
    }
    if (CorefRules.entityCorefDictionary(mention, ant, dict, 2, 2)) {
      return true;
    }
    if (CorefRules.entityCorefDictionary(mention, ant, dict, 3, 2)) {
      return true;
    }
    if (CorefRules.entityCorefDictionary(mention, ant, dict, 4, 2)) {
      return true;
    }
  }
  if (flags.DO_PRONOUN) {
    // Use mention2 itself when it is a predicate nominative of the representative mention.
    Mention m;
    if (mention.predicateNominatives != null && mention.predicateNominatives.contains(mention2)) {
      m = mention2;
    } else {
      m = mention;
    }
    boolean mIsPronoun = (m.isPronominal() || dict.allPronouns.contains(m.toString()));
    boolean attrAgree = HybridCorefProperties.useDefaultPronounAgreement(props) ? CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent) : CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent, lang);
    if (mIsPronoun && attrAgree) {
      if (dict.demonymSet.contains(ant.lowercaseNormalizedSpanString()) && dict.notOrganizationPRP.contains(m.headString)) {
        document.addIncompatible(m, ant);
        return false;
      }
      if (CorefRules.entityPersonDisagree(document, mentionCluster, potentialAntecedent, dict)) {
        document.addIncompatible(m, ant);
        return false;
      }
      return true;
    }
  }
  if (flags.USE_CHINESE_HEAD_MATCH) {
    // Same head-word object and mention2 nested inside ant.
    if (mention2.headWord == ant.headWord && mention2.insideIn(ant)) {
      // Intentionally empty diagnostic hook; kept because isCoref(...) is opaque from here.
      if (!document.isCoref(mention2, ant)) {
        // TODO: exclude conjunction
        // log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString());
      }
      return true;
    }
  }
  return ret;
}
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
Class CorefUtils, method heuristicFilter.
/**
 * Selects, for every mention, the IDs of the earlier mentions worth scoring as antecedents.
 *
 * <p>A mention's candidates are, in this order: the up to {@code maxMentionDistance} mentions
 * immediately preceding it in {@code sortedMentions}, followed by any mention that shares one of
 * its content words, has a smaller {@code mentionNum}, and lies at most
 * {@code maxMentionDistanceWithStringMatch} mention numbers back. IDs already in the list are
 * not added again by the content-word step.
 *
 * @param sortedMentions mentions in the order used for the positional window
 * @param maxMentionDistance size of the positional window of preceding mentions
 * @param maxMentionDistanceWithStringMatch maximum {@code mentionNum} distance for mentions that
 *     share a content word
 * @return map from a mention's ID to its candidate antecedent IDs; mentions with no candidates
 *     have no entry
 */
public static Map<Integer, List<Integer>> heuristicFilter(List<Mention> sortedMentions, int maxMentionDistance, int maxMentionDistanceWithStringMatch) {
  // Index every mention under each of its content words.
  Map<String, List<Mention>> wordToMentions = new HashMap<>();
  for (Mention m : sortedMentions) {
    for (String word : getContentWords(m)) {
      // Creates the bucket only when the word is new, instead of allocating a list per call.
      wordToMentions.computeIfAbsent(word, unused -> new ArrayList<>()).add(m);
    }
  }

  Map<Integer, List<Integer>> mentionToCandidateAntecedents = new HashMap<>();
  for (int i = 0; i < sortedMentions.size(); i++) {
    Mention m = sortedMentions.get(i);
    // The list keeps insertion order for the caller; the set answers "already added?" without
    // scanning the list. Fully qualified because the file's import block is not visible here.
    List<Integer> candidateAntecedents = new ArrayList<>();
    java.util.Set<Integer> seenIds = new java.util.HashSet<>();

    // Positional window: the mentions directly before position i.
    for (int j = Math.max(0, i - maxMentionDistance); j < i; j++) {
      int precedingId = sortedMentions.get(j).mentionID;
      candidateAntecedents.add(precedingId);
      seenIds.add(precedingId);
    }

    // Content-word matches, allowed to reach further back than the positional window.
    for (String word : getContentWords(m)) {
      List<Mention> withStringMatch = wordToMentions.get(word);
      if (withStringMatch == null) {
        continue;
      }
      for (Mention match : withStringMatch) {
        boolean inRange = match.mentionNum < m.mentionNum
            && match.mentionNum >= m.mentionNum - maxMentionDistanceWithStringMatch;
        // Set.add returns false for an ID that is already present, so duplicates are skipped.
        if (inRange && seenIds.add(match.mentionID)) {
          candidateAntecedents.add(match.mentionID);
        }
      }
    }

    if (!candidateAntecedents.isEmpty()) {
      mentionToCandidateAntecedents.put(m.mentionID, candidateAntecedents);
    }
  }
  return mentionToCandidateAntecedents;
}
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
Class CorefRules, method entityBothHaveProper.
/**
 * Tells whether each of the two clusters holds at least one mention whose type is
 * {@code MentionType.PROPER}.
 *
 * @param mentionCluster cluster of the current mention
 * @param potentialAntecedent cluster of the candidate antecedent
 * @return {@code true} only if both clusters contain a proper mention
 */
public static boolean entityBothHaveProper(CorefCluster mentionCluster, CorefCluster potentialAntecedent) {
  // Both clusters are scanned, mention cluster first, before the results are combined.
  boolean properInMentionCluster = containsProperMention(mentionCluster.corefMentions);
  boolean properInAntecedent = containsProperMention(potentialAntecedent.corefMentions);
  return properInMentionCluster && properInAntecedent;
}

/** Returns {@code true} as soon as a mention of type PROPER is met among {@code mentions}. */
private static boolean containsProperMention(Iterable<? extends Mention> mentions) {
  for (Mention candidate : mentions) {
    if (candidate.mentionType == MentionType.PROPER) {
      return true;
    }
  }
  return false;
}
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
Class CorefRules, method entityIsAcronym.
/**
 * Tells whether some non-pronominal mention of {@code mentionCluster} and some mention of
 * {@code potentialAntecedent} satisfy {@code isAcronym} on their original spans.
 *
 * <p>The verdict is memoized in {@code document.acronymCache} under the pair
 * (smaller cluster ID, larger cluster ID), so the argument order does not affect the cache key.
 *
 * @param document document whose acronym cache is read and, on a miss, updated
 * @param mentionCluster cluster of the current mention; its pronominal mentions are skipped
 * @param potentialAntecedent cluster of the candidate antecedent
 * @return the cached or freshly computed verdict for this pair of clusters
 */
public static boolean entityIsAcronym(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent) {
  Pair<Integer, Integer> idPair = Pair.makePair(Math.min(mentionCluster.clusterID, potentialAntecedent.clusterID), Math.max(mentionCluster.clusterID, potentialAntecedent.clusterID));

  // Single lookup instead of containsKey followed by get.
  Boolean cached = document.acronymCache.get(idPair);
  if (cached != null) {
    return cached;
  }

  // Stop at the first matching pair: further comparisons cannot change the verdict.
  // NOTE(review): assumes isAcronym(...) and isPronominal() have no side effects - confirm.
  boolean found = false;
  search:
  for (Mention m : mentionCluster.corefMentions) {
    if (m.isPronominal()) {
      continue;
    }
    for (Mention ant : potentialAntecedent.corefMentions) {
      if (isAcronym(m.originalSpan, ant.originalSpan)) {
        found = true;
        break search;
      }
    }
  }

  document.acronymCache.put(idPair, found);
  return found;
}
Aggregations