use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
the class HybridCorefPrinter method sentenceStringWithMention.
/**
 * Renders one sentence of a document as a single line with its mentions bracketed.
 *
 * <p>The line starts with {@code "\tspeaker: <speaker> (<utterance>) "}, followed by the
 * sentence tokens separated by single spaces. Every mention contributes a {@code "["}
 * before its first token and a {@code "]_<id>"} after its last token.
 *
 * @param i index of the sentence within the document's sentence list
 * @param document document holding the annotation and the gold / predicted mentions
 * @param gold if {@code true} use {@code document.goldMentions} (and gold cluster ids),
 *     otherwise {@code document.predictedMentions} (and predicted cluster ids)
 * @param printClusterID if {@code true} the id after each {@code "]_"} is the cluster id,
 *     otherwise it is the mention id
 * @return the formatted sentence string
 */
public static String sentenceStringWithMention(int i, Document document, boolean gold, boolean printClusterID) {
  StringBuilder sentStr = new StringBuilder();
  List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<List<Mention>> allMentions = gold ? document.goldMentions : document.predictedMentions;

  CoreMap sentence = sentences.get(i);
  List<Mention> mentions = allMentions.get(i);
  List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);

  // A purely numeric speaker is treated as a mention id; show that mention's text as well.
  String speaker = t.get(0).get(SpeakerAnnotation.class);
  if (NumberMatchingRegex.isDecimalInteger(speaker)) {
    Mention speakerMention = document.predictedMentionsByID.get(Integer.parseInt(speaker));
    // Guard: an id without a registered mention must not crash a printing routine.
    if (speakerMention != null) {
      speaker = speaker + ": " + speakerMention.spanToString();
    }
  }
  sentStr.append("\tspeaker: " + speaker + " (" + t.get(0).get(UtteranceAnnotation.class) + ") ");

  // Tokens are placed by their own 1-based index() rather than by list position.
  String[] tokens = new String[t.size()];
  for (CoreLabel c : t) {
    tokens[c.index() - 1] = c.word();
  }

  // startCounts: number of mentions opening at each token index.
  // endMentions: mentions closing at each (exclusive) end index; push() puts the most
  // recently seen mention first, and iteration below follows that order.
  Counter<Integer> startCounts = new ClassicCounter<>();
  Map<Integer, Deque<Mention>> endMentions = Generics.newHashMap();
  for (Mention m : mentions) {
    startCounts.incrementCount(m.startIndex);
    Deque<Mention> endingHere = endMentions.get(m.endIndex);
    if (endingHere == null) {
      endingHere = new ArrayDeque<>();
      endMentions.put(m.endIndex, endingHere);
    }
    endingHere.push(m);
  }

  for (int j = 0; j < tokens.length; j++) {
    // Close mentions whose exclusive end index is j before emitting token j.
    appendMentionEnds(sentStr, endMentions.get(j), gold, printClusterID);
    for (int k = 0; k < startCounts.getCount(j); k++) {
      if (sentStr.length() > 0 && sentStr.charAt(sentStr.length() - 1) != '[') {
        sentStr.append(" ");
      }
      sentStr.append("[");
    }
    // No space directly after an opening bracket.
    if (sentStr.length() > 0 && sentStr.charAt(sentStr.length() - 1) != '[') {
      sentStr.append(" ");
    }
    sentStr.append(tokens[j]);
  }
  // Mentions that run to the end of the sentence close after the last token.
  appendMentionEnds(sentStr, endMentions.get(tokens.length), gold, printClusterID);
  return sentStr.toString();
}

/**
 * Appends a {@code "]_<id>"} marker for each mention in {@code closing}, in iteration order.
 * Does nothing when {@code closing} is {@code null}.
 */
private static void appendMentionEnds(StringBuilder out, Deque<Mention> closing, boolean gold, boolean printClusterID) {
  if (closing == null) {
    return;
  }
  for (Mention m : closing) {
    int id;
    if (printClusterID) {
      id = gold ? m.goldCorefClusterID : m.corefClusterID;
    } else {
      id = m.mentionID;
    }
    out.append("]_").append(id);
  }
}
use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
the class HybridCorefMentionFinder method extractNPorPRP.
/**
 * Adds a mention for every subtree of the sentence's parse tree matched by
 * {@code npOrPrpMentionPattern}.
 *
 * <p>A match is skipped when its span is already in {@code mentionSpanSet}, or when
 * {@code insideNE} reports it for {@code namedEntitySpanSet} and the matched node's value
 * does not start with {@code "PRP"}. Multi-token mentions whose tokens all carry a tag
 * starting with {@code "NNP"} are also recorded in {@code namedEntitySpanSet}.
 *
 * @param s sentence providing tokens, parse tree and dependency graphs
 * @param mentions output list that receives the new mentions
 * @param mentionSpanSet spans already extracted for this sentence; updated in place
 * @param namedEntitySpanSet named-entity spans for this sentence; updated in place
 */
private static void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  Tree parse = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  parse.indexLeaves();

  SemanticGraph basicGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedGraph = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedGraph == null) {
    // Fall back to the basic dependencies when no enhanced graph is attached.
    enhancedGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }

  TregexMatcher hits = npOrPrpMentionPattern.matcher(parse);
  while (hits.find()) {
    Tree node = hits.getMatch();
    List<Tree> leaves = node.getLeaves();

    // Leaf indices are 1-based; the span is [start, end) in 0-based token positions.
    int start = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
    int end = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);

    // Drop a trailing comma from the span.
    if (",".equals(tokens.get(end - 1).word())) {
      end--;
    }

    IntPair span = new IntPair(start, end);
    if (mentionSpanSet.contains(span)) {
      continue;
    }
    if (insideNE(span, namedEntitySpanSet) && !node.value().startsWith("PRP")) {
      continue;
    }

    final int placeholderId = -1;
    Mention mention = new Mention(placeholderId, start, end, tokens, basicGraph, enhancedGraph,
        new ArrayList<>(tokens.subList(start, end)), node);
    mentions.add(mention);
    mentionSpanSet.add(span);

    if (mention.originalSpan.size() <= 1) {
      continue;
    }
    // Every token is inspected (no early exit), as in the original loop.
    boolean allProperNouns = true;
    for (CoreLabel token : mention.originalSpan) {
      allProperNouns &= token.tag().startsWith("NNP");
    }
    if (allProperNouns) {
      namedEntitySpanSet.add(span);
    }
  }
}
use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
the class HybridCorefMentionFinder method extractNamedEntityMentions.
/**
 * Adds one mention per maximal run of consecutive tokens that share the same non-"O"
 * NER label.
 *
 * <p>When the token that ends a run is a possessive {@code 's} tagged {@code POS}, it is
 * included in that run's span. Each new span is added to {@code mentions},
 * {@code mentionSpanSet} and {@code namedEntitySpanSet} unless it is already in
 * {@code mentionSpanSet}.
 *
 * @param s sentence providing tokens and dependency graphs
 * @param mentions output list that receives the new mentions
 * @param mentionSpanSet spans already extracted for this sentence; updated in place
 * @param namedEntitySpanSet named-entity spans for this sentence; updated in place
 */
protected static void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basicGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedGraph = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedGraph == null) {
    // Fall back to the basic dependencies when no enhanced graph is attached.
    enhancedGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }

  String previousLabel = "O";
  int runStart = -1;
  for (CoreLabel token : tokens) {
    String currentLabel = token.ner();
    if (currentLabel.equals(previousLabel)) {
      // Still inside the same run (or still outside any entity).
      continue;
    }

    // The label changed: the previous run ends just before this token (0-based, exclusive).
    int runEnd = token.get(CoreAnnotations.IndexAnnotation.class) - 1;
    if (!previousLabel.equals("O")) {
      boolean possessiveMarker = token.get(CoreAnnotations.TextAnnotation.class).equals("'s")
          && token.tag().equals("POS");
      if (possessiveMarker) {
        // Attach the possessive marker to the entity it follows.
        runEnd++;
      }
      IntPair span = new IntPair(runStart, runEnd);
      // Skip empty spans and spans that were already extracted.
      if (runStart < runEnd && !mentionSpanSet.contains(span)) {
        final int placeholderId = -1;
        Mention mention = new Mention(placeholderId, runStart, runEnd, tokens, basicGraph, enhancedGraph,
            new ArrayList<>(tokens.subList(runStart, runEnd)));
        mentions.add(mention);
        mentionSpanSet.add(span);
        namedEntitySpanSet.add(span);
      }
    }
    runStart = runEnd;
    previousLabel = currentLabel;
  }

  // A run that is still open after the last token extends to the end of the sentence.
  if (previousLabel.equals("O")) {
    return;
  }
  int sentenceEnd = tokens.size();
  IntPair tailSpan = new IntPair(runStart, sentenceEnd);
  if (!mentionSpanSet.contains(tailSpan)) {
    final int placeholderId = -1;
    Mention mention = new Mention(placeholderId, runStart, sentenceEnd, tokens, basicGraph, enhancedGraph,
        new ArrayList<>(tokens.subList(runStart, sentenceEnd)));
    mentions.add(mention);
    mentionSpanSet.add(tailSpan);
    namedEntitySpanSet.add(tailSpan);
  }
}
use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
the class HybridCorefMentionFinder method findMentions.
/**
 * Builds the predicted mentions of a document, one list per sentence.
 *
 * <p>Steps, in order: per-sentence extraction (premarked entities, named entities, NP/PRP,
 * enumerations), document-wide named-entity modifier extraction, head finding, removal of
 * spurious mentions, and classification by {@code mdClassifier} unless the properties
 * indicate mention-detection training.
 *
 * @param doc annotated document providing the sentence list
 * @param dict dictionaries passed to mention filtering and classification
 * @param props coref properties controlling nested-mention removal and training mode
 * @return the predicted mentions, indexed in parallel with the document's sentences
 */
@Override
public List<List<Mention>> findMentions(Annotation doc, Dictionaries dict, Properties props) {
  List<List<Mention>> predictedMentions = new ArrayList<>();
  Set<String> namedEntityStrings = Generics.newHashSet();
  List<Set<IntPair>> spanSetsBySentence = Generics.newArrayList();
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);

  // Per-sentence extraction: premarked mentions, named entities, NP/PRP, enumerations.
  for (CoreMap sentence : sentences) {
    List<Mention> sentenceMentions = new ArrayList<>();
    predictedMentions.add(sentenceMentions);

    Set<IntPair> seenSpans = Generics.newHashSet();
    Set<IntPair> neSpans = Generics.newHashSet();

    extractPremarkedEntityMentions(sentence, sentenceMentions, seenSpans, neSpans);
    extractNamedEntityMentions(sentence, sentenceMentions, seenSpans, neSpans);
    extractNPorPRP(sentence, sentenceMentions, seenSpans, neSpans);
    extractEnumerations(sentence, sentenceMentions, seenSpans, neSpans);
    addNamedEntityStrings(sentence, namedEntityStrings, neSpans);

    spanSetsBySentence.add(seenSpans);
  }

  extractNamedEntityModifiers(sentences, spanSetsBySentence, predictedMentions, namedEntityStrings);

  // Head finding, sentence by sentence, against the matching mention list.
  int sentenceIndex = 0;
  for (CoreMap sentence : sentences) {
    findHead(sentence, predictedMentions.get(sentenceIndex));
    sentenceIndex++;
  }

  // Mention selection based on document-wide information.
  removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang);

  // Mention-detection training works on the unclassified candidates.
  if (CorefProperties.isMentionDetectionTraining(props)) {
    return predictedMentions;
  }
  mdClassifier.classifyMentions(predictedMentions, dict, props);
  return predictedMentions;
}
use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
the class HybridCorefSystem method postProcessing.
/**
 * Removes appositive, predicate-nominative and relative-pronoun mentions from their
 * clusters, then drops clusters that are left with at most one mention.
 *
 * <p>Mention removal only happens when
 * {@code HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES} is set; each removed
 * mention gets its {@code corefClusterID} reset to its own {@code mentionID} and is deleted
 * from {@code document.positions}. Cluster removal only happens when
 * {@code HybridCorefProperties.REMOVE_SINGLETONS} is set.
 *
 * @param document the document whose clusters and mention positions are pruned in place
 */
private static void postProcessing(Document document) {
  Set<Mention> removeSet = Generics.newHashSet();
  Set<Integer> removeClusterSet = Generics.newHashSet();

  for (CorefCluster c : document.corefClusters.values()) {
    Set<Mention> removeMentions = Generics.newHashSet();
    for (Mention m : c.getCorefMentions()) {
      boolean hasApposition = m.appositions != null && m.appositions.size() > 0;
      boolean hasPredicateNominative = m.predicateNominatives != null && m.predicateNominatives.size() > 0;
      boolean hasRelativePronoun = m.relativePronouns != null && m.relativePronouns.size() > 0;
      if (HybridCorefProperties.REMOVE_APPOSITION_PREDICATENOMINATIVES
          && (hasApposition || hasPredicateNominative || hasRelativePronoun)) {
        removeMentions.add(m);
        removeSet.add(m);
        // The mention no longer belongs to this cluster; point it back at itself.
        m.corefClusterID = m.mentionID;
      }
    }
    // Removal is deferred until after the loop so the mention set is not modified mid-iteration.
    c.corefMentions.removeAll(removeMentions);

    // "<= 1" rather than "== 1": the removal above can leave a cluster with no mentions at
    // all, and such a cluster must not survive singleton removal either.
    if (HybridCorefProperties.REMOVE_SINGLETONS && c.getCorefMentions().size() <= 1) {
      removeClusterSet.add(c.clusterID);
    }
  }

  // Clusters are collected first and removed here to avoid mutating the map while iterating it.
  for (int removeId : removeClusterSet) {
    document.corefClusters.remove(removeId);
  }
  for (Mention m : removeSet) {
    document.positions.remove(m);
  }
}
Aggregations