Use of edu.stanford.nlp.util.CoreMap in the CoreNLP project by stanfordnlp.
From class CoNLLMentionExtractor, method extractGoldMentions:
/**
 * Extracts the gold-standard mentions of a CoNLL-2011 document, grouped by sentence.
 *
 * The first mention encountered in a coref chain keeps the chain id as its mention id
 * and gets originalRef = -1; every later mention in the chain receives a fresh id
 * (allocated above the largest chain id) and points back to the chain id via originalRef.
 * Token offsets are converted from document-level to sentence-relative indices.
 *
 * @param conllDoc the CoNLL-2011 document providing sentences and the coref chain map
 * @return one list of gold mentions per sentence, parallel to the document's sentences
 */
public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
  List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
  CollectionValuedMap<String, CoreMap> corefChainMap = conllDoc.getCorefChainMap();

  // One (initially empty) mention list per sentence.
  List<List<Mention>> allGoldMentions = new ArrayList<>();
  for (int s = 0; s < sentences.size(); s++) {
    allGoldMentions.add(new ArrayList<>());
  }

  // Fresh mention ids start just past the largest coref cluster id.
  int maxCorefClusterId = -1;
  for (String corefIdStr : corefChainMap.keySet()) {
    maxCorefClusterId = Math.max(maxCorefClusterId, Integer.parseInt(corefIdStr));
  }
  int newMentionID = maxCorefClusterId + 1;

  for (Map.Entry<String, Collection<CoreMap>> idChainEntry : corefChainMap.entrySet()) {
    int clusterId = Integer.parseInt(idChainEntry.getKey());
    boolean firstInCluster = true;
    for (CoreMap m : idChainEntry.getValue()) {
      Mention mention = new Mention();
      mention.goldCorefClusterID = clusterId;
      if (firstInCluster) {
        // First mention in cluster: reuse the cluster id, no antecedent.
        mention.mentionID = clusterId;
        mention.originalRef = -1;
        firstInCluster = false;
      } else {
        mention.mentionID = newMentionID++;
        mention.originalRef = clusterId;
      }
      if (maxID < mention.mentionID) {
        maxID = mention.mentionID;
      }
      int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
      CoreMap sent = sentences.get(sentIndex);
      // Convert document-level token offsets into sentence-relative offsets.
      int sentTokenBegin = sent.get(CoreAnnotations.TokenBeginAnnotation.class);
      mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class) - sentTokenBegin;
      mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class) - sentTokenBegin;
      // will be set by arrange
      mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);
      // Mention dependency graph is the enhanced dependency graph of the sentence.
      mention.dependency = sent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      allGoldMentions.get(sentIndex).add(mention);
    }
  }
  return allGoldMentions;
}
Use of edu.stanford.nlp.util.CoreMap in the CoreNLP project by stanfordnlp.
From class RuleBasedCorefMentionFinder, method removeSpuriousMentionsEn:
/**
 * Filters out spurious mentions from each sentence's mention list (English rules):
 * pleonastic "it", non-words, negative quantifiers, partitives, bare singular NPs,
 * percent/money expressions, adjectival demonyms, stop-list entries, and nested
 * mentions that share a headword (keeping the larger span, except in
 * apposition/enumeration contexts).
 *
 * @param doc the annotated document providing sentences, parse trees, and tokens
 * @param predictedMentions per-sentence mention lists; filtered in place
 * @param dict dictionaries of non-words, temporals, demonyms, etc.
 */
@Override
public void removeSpuriousMentionsEn(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict) {
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  for (int i = 0; i < predictedMentions.size(); i++) {
    CoreMap s = sentences.get(i);
    List<Mention> mentions = predictedMentions.get(i);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Set<Mention> remove = Generics.newHashSet();
    for (Mention m : mentions) {
      String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      // pleonastic it
      if (isPleonastic(m, tree)) {
        remove.add(m);
      }
      // non word such as 'hmm'
      if (dict.nonWords.contains(m.headString)) {
        remove.add(m);
      }
      // quantRule : not starts with 'any', 'all' etc
      if (m.originalSpan.size() > 0) {
        String firstWord = m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
        // Plain string comparison instead of firstWord.matches("none|no|nothing|not"),
        // which recompiled the regex for every mention in the loop.
        switch (firstWord) {
          case "none":
          case "no":
          case "nothing":
          case "not":
            remove.add(m);
            break;
          default:
            break;
        }
        // if(dict.quantifiers.contains(firstWord)) remove.add(m);
      }
      // partitiveRule
      if (partitiveRule(m, sent, dict)) {
        remove.add(m);
      }
      // bareNPRule: a bare non-temporal singular noun, or one modified only by an adjective
      if (headPOS.equals("NN") && !dict.temporals.contains(m.headString) && (m.originalSpan.size() == 1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) {
        remove.add(m);
      }
      if (m.headString.equals("%")) {
        remove.add(m);
      }
      if (headNE.equals("PERCENT") || headNE.equals("MONEY")) {
        remove.add(m);
      }
      // adjectival forms of demonyms, e.g. "American" used as a modifier
      if (dict.isAdjectivalDemonym(m.spanToString())) {
        remove.add(m);
      }
      // stop list (e.g., U.S., there)
      if (inStopList(m)) {
        remove.add(m);
      }
    }
    // nested mention with shared headword (except apposition, enumeration): pick larger one
    for (Mention m1 : mentions) {
      for (Mention m2 : mentions) {
        if (m1 == m2 || remove.contains(m1) || remove.contains(m2)) {
          continue;
        }
        if (m1.sentNum == m2.sentNum && m1.headWord == m2.headWord && m2.insideIn(m1)) {
          // Keep the inner mention when followed by "," or a conjunction
          // (likely apposition or enumeration).
          if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",") || sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
            continue;
          }
          remove.add(m2);
        }
      }
    }
    mentions.removeAll(remove);
  }
}
Use of edu.stanford.nlp.util.CoreMap in the CoreNLP project by stanfordnlp.
From class RuleBasedCorefMentionFinder, method filterPredictedMentions:
/**
 * Builds filtered mention lists when gold mention boundaries are given: copies the
 * gold mentions per sentence, assigns heads, marks bare plurals, and removes
 * spurious mentions.
 *
 * @param allGoldMentions per-sentence gold mentions whose boundaries are trusted
 * @param doc the annotated document the mentions belong to
 * @param dict coref dictionaries used by the spuriousness filters
 * @param props properties controlling e.g. nested-mention removal
 * @return per-sentence lists of the surviving mentions
 */
public List<List<Mention>> filterPredictedMentions(List<List<Mention>> allGoldMentions, Annotation doc, Dictionaries dict, Properties props) {
  List<List<Mention>> predictedMentions = new ArrayList<>();
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  for (int i = 0; i < allGoldMentions.size(); i++) {
    CoreMap s = sentences.get(i);
    // Start from a copy of this sentence's gold mentions.
    List<Mention> mentions = new ArrayList<>(allGoldMentions.get(i));
    predictedMentions.add(mentions);
    findHead(s, mentions);
    // todo [cdm 2013]: This block seems to do nothing - the two sets are never used
    Set<IntPair> mentionSpanSet = Generics.newHashSet();
    Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
    for (Mention m : mentions) {
      IntPair span = new IntPair(m.startIndex, m.endIndex);
      mentionSpanSet.add(span);
      String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if (!headNE.equals("O")) {
        namedEntitySpanSet.add(span);
      }
    }
    setBarePlural(mentions);
  }
  removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang);
  return predictedMentions;
}
Use of edu.stanford.nlp.util.CoreMap in the CoreNLP project by stanfordnlp.
From class Document, method findDocType:
/**
 * Classifies the document as a conversation or an article based on utterance indices.
 *
 * A document is an ARTICLE if no token ever carries a nonzero utterance index, or if
 * the utterance index drops back to 0 after having been nonzero; otherwise it is a
 * CONVERSATION. As a side effect, maxUtter is updated to the largest utterance index
 * seen.
 *
 * @param dict dictionaries providing first/second person pronoun sets
 * @return DocType.CONVERSATION or DocType.ARTICLE
 */
private DocType findDocType(Dictionaries dict) {
  boolean speakerChange = false;
  Set<Integer> discourseWithIorYou = Generics.newHashSet();
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
      int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0) {
        speakerChange = true;
      } else if (speakerChange) {
        // Utterance index fell back to 0 after a change: not a conversation.
        return DocType.ARTICLE;
      }
      String lowered = w.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
      if (dict.firstPersonPronouns.contains(lowered) || dict.secondPersonPronouns.contains(lowered)) {
        discourseWithIorYou.add(utterIndex);
      }
      maxUtter = Math.max(maxUtter, utterIndex);
    }
  }
  // in conversation, utter index keep increasing.
  return speakerChange ? DocType.CONVERSATION : DocType.ARTICLE;
}
Use of edu.stanford.nlp.util.CoreMap in the CoreNLP project by stanfordnlp.
From class Document, method findSpeakersInConversation:
/**
 * Heuristically assigns speakers to utterances in a conversational document.
 *
 * Pass 1: a mention whose predicate nominative is "I" (e.g. from "I am X") names the
 * speaker of its utterance; the speaker is recorded under the utterance index as the
 * mention's id. Pass 2: consecutive sentences sharing an utterance index are grouped
 * into a paragraph, and findParagraphSpeaker assigns a speaker per paragraph,
 * threading its return value to the following paragraph.
 *
 * @param dict coref dictionaries, passed through to findParagraphSpeaker
 */
private void findSpeakersInConversation(Dictionaries dict) {
// Pass 1: copular links to "I" identify the mention as the speaker of its utterance.
for (List<Mention> l : predictedOrderedMentionsBySentence) {
for (Mention m : l) {
if (m.predicateNominatives == null)
continue;
for (Mention a : m.predicateNominatives) {
if (a.spanToString().toLowerCase().equals("i")) {
// Key: utterance index of the mention's head token; value: mention id as a string.
speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID));
}
}
}
}
// Pass 2: group consecutive sentences with the same utterance index into a paragraph.
List<CoreMap> paragraph = new ArrayList<>();
int paragraphUtterIndex = 0;
String nextParagraphSpeaker = "";
int paragraphOffset = 0; // sentence offset of the current paragraph's first sentence
for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
// The first token's utterance index stands for the whole sentence's utterance.
int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class);
if (paragraphUtterIndex != currentUtter) {
// Utterance boundary: resolve the finished paragraph, then start a new one.
// The return value seeds the speaker guess for the next paragraph.
nextParagraphSpeaker = findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
paragraphUtterIndex = currentUtter;
paragraphOffset += paragraph.size();
paragraph = new ArrayList<>();
}
paragraph.add(sent);
}
// Resolve the trailing paragraph (its returned next-speaker value is unused).
findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
Aggregations