Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
In class RuleBasedCorefMentionFinder, the method filterPredictedMentions:
/** When mention boundaries are given */
public List<List<Mention>> filterPredictedMentions(List<List<Mention>> allGoldMentions, Annotation doc, Dictionaries dict, Properties props) {
  List<List<Mention>> predictedMentions = new ArrayList<>();
  for (int i = 0; i < allGoldMentions.size(); i++) {
    CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
    List<Mention> goldMentions = allGoldMentions.get(i);
    List<Mention> mentions = new ArrayList<>();
    predictedMentions.add(mentions);
    mentions.addAll(goldMentions);
    findHead(s, mentions);
    // todo [cdm 2013]: This block seems to do nothing - the two sets are never used
    Set<IntPair> mentionSpanSet = Generics.newHashSet();
    Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
    for (Mention m : mentions) {
      mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex));
      if (!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) {
        namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex));
      }
    }
    setBarePlural(mentions);
  }
  removeSpuriousMentions(doc, predictedMentions, dict, CorefProperties.removeNestedMentions(props), lang);
  return predictedMentions;
}
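The span sets flagged in the todo above are used elsewhere in CoreNLP's mention finders to avoid re-adding a span that was already recorded. Below is a minimal, self-contained sketch of that deduplication idiom; the (startIndex, endIndex) pairs are made up for illustration and stand in for real Mention objects.

import java.util.Set;

import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;

public class SpanDedupSketch {
  public static void main(String[] args) {
    // Candidate spans as (startIndex, endIndex) pairs; values are invented for this sketch.
    int[][] candidates = { { 0, 2 }, { 3, 5 }, { 0, 2 } };
    Set<IntPair> mentionSpanSet = Generics.newHashSet();
    for (int[] c : candidates) {
      IntPair span = new IntPair(c[0], c[1]);
      // Set.add returns false for a duplicate, so each span is kept only once,
      // mirroring the mentionSpanSet checks in CoreNLP's other mention finders.
      if (mentionSpanSet.add(span)) {
        System.out.println("kept span " + span);
      } else {
        System.out.println("skipped duplicate span " + span);
      }
    }
  }
}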
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
In class SingletonPredictor, the method generateFeatureVectors:
/**
 * Generate the training features from the CoNLL input file.
 * @return Dataset of feature vectors
 * @throws Exception
 */
private static GeneralDataset<String, String> generateFeatureVectors(Properties props) throws Exception {
  GeneralDataset<String, String> dataset = new Dataset<>();
  Dictionaries dict = new Dictionaries(props);
  DocumentMaker docMaker = new DocumentMaker(props, dict);
  Document document;
  while ((document = docMaker.nextDoc()) != null) {
    setTokenIndices(document);
    Map<Integer, CorefCluster> entities = document.goldCorefClusters;
    // Generate features for coreferent mentions with class label 1
    for (CorefCluster entity : entities.values()) {
      for (Mention mention : entity.getCorefMentions()) {
        // Ignore verbal mentions
        if (mention.headWord.tag().startsWith("V"))
          continue;
        IndexedWord head = mention.enhancedDependency.getNodeByIndexSafe(mention.headWord.index());
        if (head == null)
          continue;
        ArrayList<String> feats = mention.getSingletonFeatures(dict);
        dataset.add(new BasicDatum<>(feats, "1"));
      }
    }
    // Generate features for singletons with class label 0
    ArrayList<CoreLabel> gold_heads = new ArrayList<>();
    for (Mention gold_men : document.goldMentionsByID.values()) {
      gold_heads.add(gold_men.headWord);
    }
    for (Mention predicted_men : document.predictedMentionsByID.values()) {
      SemanticGraph dep = predicted_men.enhancedDependency;
      IndexedWord head = dep.getNodeByIndexSafe(predicted_men.headWord.index());
      if (head == null || !dep.vertexSet().contains(head))
        continue;
      // Ignore verbal mentions
      if (predicted_men.headWord.tag().startsWith("V"))
        continue;
      // If the mention is in the gold set, it is not a singleton and thus ignore
      if (gold_heads.contains(predicted_men.headWord))
        continue;
      dataset.add(new BasicDatum<>(predicted_men.getSingletonFeatures(dict), "0"));
    }
  }
  dataset.summaryStatistics();
  return dataset;
}
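Downstream, SingletonPredictor trains a binary classifier over the "1"/"0" labels produced here. A hedged sketch of that training step using CoreNLP's classify package, with toy feature lists standing in for Mention.getSingletonFeatures(dict) and assuming the trainClassifier(GeneralDataset) overload of LogisticClassifierFactory:

import java.util.Arrays;

import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.classify.LogisticClassifierFactory;
import edu.stanford.nlp.ling.BasicDatum;

public class SingletonTrainingSketch {
  public static void main(String[] args) {
    GeneralDataset<String, String> dataset = new Dataset<>();
    // Toy feature vectors; the feature names are invented for illustration.
    dataset.add(new BasicDatum<>(Arrays.asList("pos=NN", "definite"), "1"));
    dataset.add(new BasicDatum<>(Arrays.asList("pos=NNP", "bare"), "0"));
    // Train a binary logistic model over the "1"/"0" labels, as SingletonPredictor does.
    LogisticClassifier<String, String> classifier =
        new LogisticClassifierFactory<String, String>().trainClassifier(dataset);
    // Classify an unseen feature vector.
    System.out.println(classifier.classOf(new BasicDatum<>(Arrays.asList("pos=NN", "definite"))));
  }
}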
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
In class CategoricalFeatureExtractor, the method getPairFeatures:
public SimpleMatrix getPairFeatures(Pair<Integer, Integer> pair, Document document, Map<Integer, List<Mention>> mentionsByHeadIndex) {
  Mention m1 = document.predictedMentionsByID.get(pair.first);
  Mention m2 = document.predictedMentionsByID.get(pair.second);
  List<Integer> featureVals = pairwiseFeatures(document, m1, m2, dictionaries, conll);
  SimpleMatrix features = new SimpleMatrix(featureVals.size(), 1);
  for (int i = 0; i < featureVals.size(); i++) {
    features.set(i, featureVals.get(i));
  }
  features = NeuralUtils.concatenate(
      features,
      encodeDistance(m2.sentNum - m1.sentNum),
      encodeDistance(m2.mentionNum - m1.mentionNum - 1),
      new SimpleMatrix(new double[][] { { m1.sentNum == m2.sentNum && m1.endIndex > m2.startIndex ? 1 : 0 } }),
      getMentionFeatures(m1, document, mentionsByHeadIndex),
      getMentionFeatures(m2, document, mentionsByHeadIndex),
      encodeGenre(document));
  return features;
}
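encodeDistance is not shown here; conceptually it maps an integer distance (sentence or mention offset) onto a small one-hot bucket vector so the model sees coarse rather than raw distances. A hedged sketch of that idea follows; the bucket boundaries below are illustrative and not necessarily the ones CoreNLP uses.

import org.ejml.simple.SimpleMatrix;

public class DistanceEncodingSketch {
  // Bucketed-distance encoding: one-hot over distance ranges, as a column vector.
  static SimpleMatrix encodeDistance(int d) {
    int[] thresholds = { 0, 1, 2, 3, 4, 8, 16, 32, 64 };
    SimpleMatrix m = new SimpleMatrix(thresholds.length + 1, 1);
    int bucket = thresholds.length;  // overflow bucket for very large distances
    for (int i = 0; i < thresholds.length; i++) {
      if (d <= thresholds[i]) {
        bucket = i;
        break;
      }
    }
    m.set(bucket, 1);  // set the single active bucket
    return m;
  }

  public static void main(String[] args) {
    System.out.println(encodeDistance(5));  // falls into the "<= 8" bucket
  }
}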
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
In class EmbeddingExtractor, the method getDocumentEmbedding:
public SimpleMatrix getDocumentEmbedding(Document document) {
  if (!conll) {
    return new SimpleMatrix(staticWordEmbeddings.getEmbeddingSize(), 1);
  }
  List<CoreLabel> words = new ArrayList<>();
  Set<Integer> seenSentences = new HashSet<>();
  for (Mention m : document.predictedMentionsByID.values()) {
    if (!seenSentences.contains(m.sentNum)) {
      seenSentences.add(m.sentNum);
      words.addAll(m.sentenceWords);
    }
  }
  return getAverageEmbedding(words);
}
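getAverageEmbedding then averages the word vectors of the collected sentence tokens. A simplified stand-in for that step, operating directly on SimpleMatrix column vectors instead of looking tokens up in an embedding table:

import java.util.Arrays;
import java.util.List;

import org.ejml.simple.SimpleMatrix;

public class AverageEmbeddingSketch {
  // Averages a list of equally sized column vectors; returns the zero vector
  // for an empty list, matching the "no words" fallback above.
  static SimpleMatrix average(List<SimpleMatrix> vectors, int size) {
    SimpleMatrix sum = new SimpleMatrix(size, 1);
    for (SimpleMatrix v : vectors) {
      sum = sum.plus(v);
    }
    return vectors.isEmpty() ? sum : sum.divide(vectors.size());
  }

  public static void main(String[] args) {
    SimpleMatrix a = new SimpleMatrix(new double[][] { { 1 }, { 2 } });
    SimpleMatrix b = new SimpleMatrix(new double[][] { { 3 }, { 4 } });
    System.out.println(average(Arrays.asList(a, b), 2));  // prints the vector [2; 3]
  }
}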
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
In class NeuralCorefAlgorithm, the method runCoref:
@Override
public void runCoref(Document document) {
  List<Mention> sortedMentions = CorefUtils.getSortedMentions(document);
  Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
  for (Mention m : sortedMentions) {
    List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
    if (withIndex == null) {
      withIndex = new ArrayList<>();
      mentionsByHeadIndex.put(m.headIndex, withIndex);
    }
    withIndex.add(m);
  }
  SimpleMatrix documentEmbedding = embeddingExtractor.getDocumentEmbedding(document);
  Map<Integer, SimpleMatrix> antecedentEmbeddings = new HashMap<>();
  Map<Integer, SimpleMatrix> anaphorEmbeddings = new HashMap<>();
  Counter<Integer> anaphoricityScores = new ClassicCounter<>();
  for (Mention m : sortedMentions) {
    SimpleMatrix mentionEmbedding = embeddingExtractor.getMentionEmbeddings(m, documentEmbedding);
    antecedentEmbeddings.put(m.mentionID, model.getAntecedentEmbedding(mentionEmbedding));
    anaphorEmbeddings.put(m.mentionID, model.getAnaphorEmbedding(mentionEmbedding));
    anaphoricityScores.incrementCount(m.mentionID,
        model.getAnaphoricityScore(mentionEmbedding,
            featureExtractor.getAnaphoricityFeatures(m, document, mentionsByHeadIndex)));
  }
  Map<Integer, List<Integer>> mentionToCandidateAntecedents =
      CorefUtils.heuristicFilter(sortedMentions, maxMentionDistance, maxMentionDistanceWithStringMatch);
  for (Map.Entry<Integer, List<Integer>> e : mentionToCandidateAntecedents.entrySet()) {
    double bestScore = anaphoricityScores.getCount(e.getKey()) - 50 * (greedyness - 0.5);
    int m = e.getKey();
    Integer antecedent = null;
    for (int ca : e.getValue()) {
      double score = model.getPairwiseScore(antecedentEmbeddings.get(ca), anaphorEmbeddings.get(m),
          featureExtractor.getPairFeatures(new Pair<>(ca, m), document, mentionsByHeadIndex));
      if (score > bestScore) {
        bestScore = score;
        antecedent = ca;
      }
    }
    if (antecedent != null) {
      CorefUtils.mergeCoreferenceClusters(new Pair<>(antecedent, m), document);
    }
  }
}
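The final loop is a greedy best-antecedent search: each mention starts from its anaphoricity-derived threshold and links to the highest-scoring candidate antecedent that beats it, or to nothing. A self-contained sketch of that decision rule with hypothetical mention IDs and scores in place of the model:

import java.util.HashMap;
import java.util.Map;

public class GreedyLinkingSketch {
  public static void main(String[] args) {
    // Hypothetical pairwise scores: key = anaphor id, value = (antecedent id -> score).
    Map<Integer, Map<Integer, Double>> scores = new HashMap<>();
    Map<Integer, Double> forMention3 = new HashMap<>();
    forMention3.put(1, 0.2);
    forMention3.put(2, 0.9);
    scores.put(3, forMention3);

    double greedyness = 0.5;  // same knob as NeuralCorefAlgorithm (spelled as in the source)
    for (Map.Entry<Integer, Map<Integer, Double>> e : scores.entrySet()) {
      // Start from the anaphoricity-based threshold; 0.5 is a fixed stand-in value here.
      double bestScore = 0.5 - 50 * (greedyness - 0.5);
      Integer antecedent = null;
      for (Map.Entry<Integer, Double> cand : e.getValue().entrySet()) {
        if (cand.getValue() > bestScore) {
          bestScore = cand.getValue();
          antecedent = cand.getKey();
        }
      }
      // antecedent stays null if no candidate clears the threshold.
      System.out.println("mention " + e.getKey() + " -> antecedent " + antecedent);
    }
  }
}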