Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
The class ProtobufAnnotationSerializer, method toProto.
public CoreNLPProtos.Mention toProto(Mention mention) {
  // create the builder
  CoreNLPProtos.Mention.Builder builder = CoreNLPProtos.Mention.newBuilder();
  // set enums
  if (mention.mentionType != null) {
    builder.setMentionType(mention.mentionType.name());
  }
  if (mention.gender != null) {
    builder.setGender(mention.gender.name());
  }
  if (mention.number != null) {
    builder.setNumber(mention.number.name());
  }
  if (mention.animacy != null) {
    builder.setAnimacy(mention.animacy.name());
  }
  if (mention.person != null) {
    builder.setPerson(mention.person.name());
  }
  if (mention.headString != null) {
    builder.setHeadString(mention.headString);
  }
  if (mention.nerString != null) {
    builder.setNerString(mention.nerString);
  }
  builder.setStartIndex(mention.startIndex);
  builder.setEndIndex(mention.endIndex);
  builder.setHeadIndex(mention.headIndex);
  builder.setMentionID(mention.mentionID);
  builder.setOriginalRef(mention.originalRef);
  builder.setGoldCorefClusterID(mention.goldCorefClusterID);
  builder.setCorefClusterID(mention.corefClusterID);
  builder.setMentionNum(mention.mentionNum);
  builder.setSentNum(mention.sentNum);
  builder.setUtter(mention.utter);
  builder.setParagraph(mention.paragraph);
  builder.setIsSubject(mention.isSubject);
  builder.setIsDirectObject(mention.isDirectObject);
  builder.setIsIndirectObject(mention.isIndirectObject);
  builder.setIsPrepositionObject(mention.isPrepositionObject);
  builder.setHasTwin(mention.hasTwin);
  builder.setGeneric(mention.generic);
  builder.setIsSingleton(mention.isSingleton);
  // handle the two sets of Strings
  if (mention.dependents != null) {
    mention.dependents.forEach(builder::addDependents);
  }
  if (mention.preprocessedTerms != null) {
    mention.preprocessedTerms.forEach(builder::addPreprocessedTerms);
  }
  // set IndexedWords by storing (sentence number, token index) pairs
  builder.setDependingVerb(createIndexedWordProtoFromIW(mention.dependingVerb));
  builder.setHeadIndexedWord(createIndexedWordProtoFromIW(mention.headIndexedWord));
  builder.setHeadWord(createIndexedWordProtoFromCL(mention.headWord));
  // add positions for each CoreLabel in sentence
  if (mention.sentenceWords != null) {
    for (CoreLabel cl : mention.sentenceWords) {
      builder.addSentenceWords(createIndexedWordProtoFromCL(cl));
    }
  }
  if (mention.originalSpan != null) {
    for (CoreLabel cl : mention.originalSpan) {
      builder.addOriginalSpan(createIndexedWordProtoFromCL(cl));
    }
  }
  // flag if this Mention should get basicDependency, collapsedDependency, and contextParseTree or not
  builder.setHasBasicDependency((mention.basicDependency != null));
  builder.setHasEnhancedDepenedncy((mention.enhancedDependency != null));
  builder.setHasContextParseTree((mention.contextParseTree != null));
  // handle the sets of Mentions, just store mentionID
  if (mention.appositions != null) {
    for (Mention m : mention.appositions) {
      builder.addAppositions(m.mentionID);
    }
  }
  if (mention.predicateNominatives != null) {
    for (Mention m : mention.predicateNominatives) {
      builder.addPredicateNominatives(m.mentionID);
    }
  }
  if (mention.relativePronouns != null) {
    for (Mention m : mention.relativePronouns) {
      builder.addRelativePronouns(m.mentionID);
    }
  }
  if (mention.listMembers != null) {
    for (Mention m : mention.listMembers) {
      builder.addListMembers(m.mentionID);
    }
  }
  if (mention.belongToLists != null) {
    for (Mention m : mention.belongToLists) {
      builder.addBelongToLists(m.mentionID);
    }
  }
  if (mention.speakerInfo != null) {
    builder.setSpeakerInfo(toProto(mention.speakerInfo));
  }
  return builder.build();
}
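The two createIndexedWordProto helpers used above are not included in this listing. The sketch below is only an approximation of what they do, inferred from the comment about (sentence number, token index) pairs and from how fromProto reads the values back; it assumes CoreNLPProtos.IndexedWord exposes setSentenceNum, setTokenIndex, and setCopyCount builder methods and that -1 encodes a missing word (which is what the >= 0 guards in fromProto check for).

// Approximate sketch (not the verbatim CoreNLP source): encode a token position
// as a (sentence number, token index) pair, with -1 meaning "no word".
private CoreNLPProtos.IndexedWord createIndexedWordProtoFromCL(CoreLabel cl) {
  CoreNLPProtos.IndexedWord.Builder builder = CoreNLPProtos.IndexedWord.newBuilder();
  if (cl == null) {
    builder.setSentenceNum(-1);
    builder.setTokenIndex(-1);
  } else {
    builder.setSentenceNum(cl.sentIndex());   // sentIndex() assumed zero-based
    builder.setTokenIndex(cl.index() - 1);    // index() is one-based; fromProto uses this as a list index
  }
  return builder.build();
}

private CoreNLPProtos.IndexedWord createIndexedWordProtoFromIW(IndexedWord iw) {
  CoreNLPProtos.IndexedWord.Builder builder = CoreNLPProtos.IndexedWord.newBuilder();
  if (iw == null) {
    builder.setSentenceNum(-1);
    builder.setTokenIndex(-1);
  } else {
    builder.setSentenceNum(iw.sentIndex());
    builder.setTokenIndex(iw.index() - 1);
    builder.setCopyCount(iw.copyCount());     // restored via setCopyCount in fromProto below
  }
  return builder.build();
}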
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
The class ProtobufAnnotationSerializer, method fromProto.
/**
 * Returns a complete document, intended to mimic a document passed as input to
 * {@link ProtobufAnnotationSerializer#toProto(Annotation)} as closely as possible.
 * That is, most common fields are serialized, but there is no guarantee that custom additions
 * will be saved and retrieved.
 *
 * @param proto The protocol buffer to read the document from.
 * @return An Annotation corresponding to the read protobuf.
 */
@SuppressWarnings("deprecation")
public Annotation fromProto(CoreNLPProtos.Document proto) {
  if (Thread.interrupted()) {
    throw new RuntimeInterruptedException();
  }
  // Set text
  Annotation ann = new Annotation(proto.getText());
  // if there are characters, add characters
  if (proto.getCharacterCount() > 0) {
    List<CoreLabel> docChars = new ArrayList<CoreLabel>();
    for (CoreNLPProtos.Token c : proto.getCharacterList()) {
      docChars.add(fromProto(c));
    }
    ann.set(SegmenterCoreAnnotations.CharactersAnnotation.class, docChars);
  }
  // Add tokens
  List<CoreLabel> tokens = new ArrayList<>();
  if (proto.getSentenceCount() > 0) {
    // Populate the tokens from the sentence
    for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
      // It's conceivable that the sentences are not contiguous -- pad this with nulls
      while (sentence.hasTokenOffsetBegin() && tokens.size() < sentence.getTokenOffsetBegin()) {
        tokens.add(null);
      }
      // Read the sentence
      for (CoreNLPProtos.Token token : sentence.getTokenList()) {
        CoreLabel coreLabel = fromProto(token);
        // Set docid
        if (proto.hasDocID()) {
          coreLabel.setDocID(proto.getDocID());
        }
        if (token.hasTokenBeginIndex() && token.hasTokenEndIndex()) {
          // This is usually true, if enough annotators are defined
          while (tokens.size() < sentence.getTokenOffsetEnd()) {
            tokens.add(null);
          }
          for (int i = token.getTokenBeginIndex(); i < token.getTokenEndIndex(); ++i) {
            tokens.set(token.getTokenBeginIndex(), coreLabel);
          }
        } else {
          // Assume this token spans a single token, and just add it to the tokens list
          tokens.add(coreLabel);
        }
      }
    }
  } else if (proto.getSentencelessTokenCount() > 0) {
    // Eek -- no sentences. Try to recover tokens directly
    if (proto.getSentencelessTokenCount() > 0) {
      for (CoreNLPProtos.Token token : proto.getSentencelessTokenList()) {
        CoreLabel coreLabel = fromProto(token);
        // Set docid
        if (proto.hasDocID()) {
          coreLabel.setDocID(proto.getDocID());
        }
        tokens.add(coreLabel);
      }
    }
  }
  if (!tokens.isEmpty()) {
    ann.set(TokensAnnotation.class, tokens);
  }
  // Add sentences
  List<CoreMap> sentences = new ArrayList<>(proto.getSentenceCount());
  for (int sentIndex = 0; sentIndex < proto.getSentenceCount(); ++sentIndex) {
    CoreNLPProtos.Sentence sentence = proto.getSentence(sentIndex);
    CoreMap map = fromProtoNoTokens(sentence);
    if (!tokens.isEmpty() && sentence.hasTokenOffsetBegin() && sentence.hasTokenOffsetEnd() && map.get(TokensAnnotation.class) == null) {
      // Set tokens for sentence
      int tokenBegin = sentence.getTokenOffsetBegin();
      int tokenEnd = sentence.getTokenOffsetEnd();
      assert tokenBegin <= tokens.size() && tokenBegin <= tokenEnd;
      assert tokenEnd <= tokens.size();
      map.set(TokensAnnotation.class, tokens.subList(tokenBegin, tokenEnd));
      // Set sentence index + token index + paragraph index
      for (int i = tokenBegin; i < tokenEnd; ++i) {
        tokens.get(i).setSentIndex(sentIndex);
        tokens.get(i).setIndex(i - sentence.getTokenOffsetBegin() + 1);
        if (sentence.hasParagraph()) {
          tokens.get(i).set(ParagraphAnnotation.class, sentence.getParagraph());
        }
      }
      // Set text
      int characterBegin = sentence.getCharacterOffsetBegin();
      int characterEnd = sentence.getCharacterOffsetEnd();
      if (characterEnd <= proto.getText().length()) {
        // The usual case -- get the text from the document text
        map.set(TextAnnotation.class, proto.getText().substring(characterBegin, characterEnd));
      } else {
        // The document text is wrong -- guess the text from the tokens
        map.set(TextAnnotation.class, recoverOriginalText(tokens.subList(tokenBegin, tokenEnd), sentence));
      }
    }
    // End iteration
    sentences.add(map);
  }
  if (!sentences.isEmpty()) {
    ann.set(SentencesAnnotation.class, sentences);
  }
  // Set DocID
  String docid = null;
  if (proto.hasDocID()) {
    docid = proto.getDocID();
    ann.set(DocIDAnnotation.class, docid);
  }
  // Set reference time
  if (proto.hasDocDate()) {
    ann.set(DocDateAnnotation.class, proto.getDocDate());
  }
  if (proto.hasCalendar()) {
    GregorianCalendar calendar = new GregorianCalendar();
    calendar.setTimeInMillis(proto.getCalendar());
    ann.set(CalendarAnnotation.class, calendar);
  }
  // Set coref chain
  Map<Integer, CorefChain> corefChains = new HashMap<>();
  for (CoreNLPProtos.CorefChain chainProto : proto.getCorefChainList()) {
    CorefChain chain = fromProto(chainProto, ann);
    corefChains.put(chain.getChainID(), chain);
  }
  if (!corefChains.isEmpty()) {
    ann.set(CorefChainAnnotation.class, corefChains);
  }
  // hashes for accessing Mentions: later in this method we need to add speakerInfo to each Mention,
  // so build id -> Mention and id -> CoreNLPProtos.Mention maps here, since a SpeakerInfo could reference
  // any Mention in the document
  HashMap<Integer, Mention> idToMention = new HashMap<>();
  HashMap<Integer, CoreNLPProtos.Mention> idToProtoMention = new HashMap<>();
  // Set things in the sentence that need a document context.
  for (int sentenceIndex = 0; sentenceIndex < proto.getSentenceCount(); ++sentenceIndex) {
    CoreNLPProtos.Sentence sentence = proto.getSentenceList().get(sentenceIndex);
    CoreMap map = sentences.get(sentenceIndex);
    List<CoreLabel> sentenceTokens = map.get(TokensAnnotation.class);
    // Set dependency graphs
    if (sentence.hasBasicDependencies()) {
      map.set(BasicDependenciesAnnotation.class, fromProto(sentence.getBasicDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasCollapsedDependencies()) {
      map.set(CollapsedDependenciesAnnotation.class, fromProto(sentence.getCollapsedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasCollapsedCCProcessedDependencies()) {
      map.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(sentence.getCollapsedCCProcessedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasAlternativeDependencies()) {
      map.set(AlternativeDependenciesAnnotation.class, fromProto(sentence.getAlternativeDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasEnhancedDependencies()) {
      map.set(EnhancedDependenciesAnnotation.class, fromProto(sentence.getEnhancedDependencies(), sentenceTokens, docid));
    }
    if (sentence.hasEnhancedPlusPlusDependencies()) {
      map.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(sentence.getEnhancedPlusPlusDependencies(), sentenceTokens, docid));
    }
    // Set entailed sentences
    if (sentence.getEntailedSentenceCount() > 0) {
      Set<SentenceFragment> entailedSentences = sentence.getEntailedSentenceList().stream().map(frag -> fromProto(frag, map.get(EnhancedPlusPlusDependenciesAnnotation.class))).collect(Collectors.toSet());
      map.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences);
    }
    if (sentence.getEntailedClauseCount() > 0) {
      Set<SentenceFragment> entailedClauses = sentence.getEntailedClauseList().stream().map(frag -> fromProto(frag, map.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toSet());
      map.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses);
    }
    // Set relation triples
    if (sentence.getOpenieTripleCount() > 0) {
      List<RelationTriple> triples = new ArrayList<>();
      for (CoreNLPProtos.RelationTriple triple : sentence.getOpenieTripleList()) {
        triples.add(fromProto(triple, ann, sentenceIndex));
      }
      map.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, triples);
    }
    // Redo some light annotation
    if (map.containsKey(TokensAnnotation.class) && (!sentence.hasHasNumerizedTokensAnnotation() || sentence.getHasNumerizedTokensAnnotation())) {
      map.set(NumerizedTokensAnnotation.class, NumberNormalizer.findAndMergeNumbers(map));
    }
    // add the CoreLabel and IndexedWord info to each mention
    // when Mentions are serialized, just storing the index in the sentence for CoreLabels and IndexedWords
    // this is the point where the de-serialized sentence has tokens
    int mentionInt = 0;
    for (CoreNLPProtos.Mention protoMention : sentence.getMentionsForCorefList()) {
      // get the mention
      Mention mentionToUpdate = map.get(CorefMentionsAnnotation.class).get(mentionInt);
      // store these in hash for more processing later in this method
      idToMention.put(mentionToUpdate.mentionID, mentionToUpdate);
      idToProtoMention.put(mentionToUpdate.mentionID, protoMention);
      // update the values
      int headIndexedWordIndex = protoMention.getHeadIndexedWord().getTokenIndex();
      if (headIndexedWordIndex >= 0) {
        mentionToUpdate.headIndexedWord = new IndexedWord(sentenceTokens.get(protoMention.getHeadIndexedWord().getTokenIndex()));
        mentionToUpdate.headIndexedWord.setCopyCount(protoMention.getHeadIndexedWord().getCopyCount());
      }
      int dependingVerbIndex = protoMention.getDependingVerb().getTokenIndex();
      if (dependingVerbIndex >= 0) {
        mentionToUpdate.dependingVerb = new IndexedWord(sentenceTokens.get(protoMention.getDependingVerb().getTokenIndex()));
        mentionToUpdate.dependingVerb.setCopyCount(protoMention.getDependingVerb().getCopyCount());
      }
      int headWordIndex = protoMention.getHeadWord().getTokenIndex();
      if (headWordIndex >= 0) {
        mentionToUpdate.headWord = sentenceTokens.get(protoMention.getHeadWord().getTokenIndex());
      }
      mentionToUpdate.sentenceWords = new ArrayList<>();
      for (CoreNLPProtos.IndexedWord clp : protoMention.getSentenceWordsList()) {
        int ti = clp.getTokenIndex();
        mentionToUpdate.sentenceWords.add(sentenceTokens.get(ti));
      }
      mentionToUpdate.originalSpan = new ArrayList<>();
      for (CoreNLPProtos.IndexedWord clp : protoMention.getOriginalSpanList()) {
        int ti = clp.getTokenIndex();
        mentionToUpdate.originalSpan.add(sentenceTokens.get(ti));
      }
      if (protoMention.getHasBasicDependency()) {
        mentionToUpdate.basicDependency = map.get(BasicDependenciesAnnotation.class);
      }
      if (protoMention.getHasEnhancedDepenedncy()) {
        mentionToUpdate.enhancedDependency = map.get(EnhancedDependenciesAnnotation.class);
      }
      if (protoMention.getHasContextParseTree()) {
        mentionToUpdate.contextParseTree = map.get(TreeAnnotation.class);
      }
      // move on to next mention
      mentionInt++;
    }
  }
  // Set quotes
  List<CoreMap> quotes = proto.getQuoteList().stream().map(quote -> fromProto(quote, tokens)).collect(Collectors.toList());
  if (!quotes.isEmpty()) {
    ann.set(QuotationsAnnotation.class, quotes);
  }
  // Set NERmention
  List<CoreMap> mentions = proto.getMentionsList().stream().map(this::fromProto).collect(Collectors.toList());
  if (!mentions.isEmpty()) {
    ann.set(MentionsAnnotation.class, mentions);
  }
  // also add all the Set<Mention>
  for (int mentionID : idToMention.keySet()) {
    // this is the Mention message corresponding to this Mention
    Mention mentionToUpdate = idToMention.get(mentionID);
    CoreNLPProtos.Mention correspondingProtoMention = idToProtoMention.get(mentionID);
    if (!correspondingProtoMention.hasSpeakerInfo()) {
      // so just continue to next Mention
      continue;
    }
    // if we're here we know a speakerInfo was stored
    SpeakerInfo speakerInfo = fromProto(correspondingProtoMention.getSpeakerInfo());
    // MentionID is ID in document, 0, 1, 2, etc...
    for (int speakerInfoMentionID : correspondingProtoMention.getSpeakerInfo().getMentionsList()) {
      speakerInfo.addMention(idToMention.get(speakerInfoMentionID));
    }
    // now the SpeakerInfo for this Mention should be fully restored
    mentionToUpdate.speakerInfo = speakerInfo;
  }
  // Return
  return ann;
}
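For context, here is a minimal round-trip sketch of how toProto and fromProto are typically used together. The text, annotator list, and variable names are illustrative assumptions; only toProto(Annotation), fromProto(CoreNLPProtos.Document), and the standard protobuf byte-array methods are relied on.

// Round-trip sketch: annotate, serialize to protobuf bytes, then deserialize.
public static void main(String[] args) throws Exception {
  Properties props = new Properties();
  // assumed annotator set; any pipeline that produces the fields used above will do
  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse,coref");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  Annotation original = new Annotation("Barack Obama was born in Hawaii. He was elected in 2008.");
  pipeline.annotate(original);

  ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
  byte[] bytes = serializer.toProto(original).toByteArray();   // standard protobuf serialization

  CoreNLPProtos.Document proto = CoreNLPProtos.Document.parseFrom(bytes);
  Annotation restored = serializer.fromProto(proto);           // the method shown above
  System.out.println(restored.get(CoreAnnotations.SentencesAnnotation.class).size() + " sentences restored");
}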
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
The class NeuralCorefDataExporter, method process.
@Override
public void process(int id, Document document) {
  JsonArrayBuilder clusters = Json.createArrayBuilder();
  for (CorefCluster gold : document.goldCorefClusters.values()) {
    JsonArrayBuilder c = Json.createArrayBuilder();
    for (Mention m : gold.corefMentions) {
      c.add(m.mentionID);
    }
    clusters.add(c.build());
  }
  goldClusterWriter.println(Json.createObjectBuilder().add(String.valueOf(id), clusters.build()).build());
  Map<Pair<Integer, Integer>, Boolean> mentionPairs = CorefUtils.getLabeledMentionPairs(document);
  List<Mention> mentionsList = CorefUtils.getSortedMentions(document);
  Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
  for (int i = 0; i < mentionsList.size(); i++) {
    Mention m = mentionsList.get(i);
    List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
    if (withIndex == null) {
      withIndex = new ArrayList<>();
      mentionsByHeadIndex.put(m.headIndex, withIndex);
    }
    withIndex.add(m);
  }
  JsonObjectBuilder docFeatures = Json.createObjectBuilder();
  docFeatures.add("doc_id", id);
  docFeatures.add("type", document.docType == DocType.ARTICLE ? 1 : 0);
  docFeatures.add("source", document.docInfo.get("DOC_ID").split("/")[0]);
  JsonArrayBuilder sentences = Json.createArrayBuilder();
  for (CoreMap sentence : document.annotation.get(SentencesAnnotation.class)) {
    sentences.add(getSentenceArray(sentence.get(CoreAnnotations.TokensAnnotation.class)));
  }
  JsonObjectBuilder mentions = Json.createObjectBuilder();
  for (Mention m : document.predictedMentionsByID.values()) {
    Iterator<SemanticGraphEdge> iterator = m.enhancedDependency.incomingEdgeIterator(m.headIndexedWord);
    SemanticGraphEdge relation = iterator.hasNext() ? iterator.next() : null;
    String depRelation = relation == null ? "no-parent" : relation.getRelation().toString();
    String depParent = relation == null ? "<missing>" : relation.getSource().word();
    mentions.add(String.valueOf(m.mentionNum), Json.createObjectBuilder()
        .add("doc_id", id)
        .add("mention_id", m.mentionID)
        .add("mention_num", m.mentionNum)
        .add("sent_num", m.sentNum)
        .add("start_index", m.startIndex)
        .add("end_index", m.endIndex)
        .add("head_index", m.headIndex)
        .add("mention_type", m.mentionType.toString())
        .add("dep_relation", depRelation)
        .add("dep_parent", depParent)
        .add("sentence", getSentenceArray(m.sentenceWords))
        .add("contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)) ? 1 : 0)
        .build());
  }
  JsonArrayBuilder featureNames = Json.createArrayBuilder()
      .add("same-speaker")
      .add("antecedent-is-mention-speaker")
      .add("mention-is-antecedent-speaker")
      .add("relaxed-head-match")
      .add("exact-string-match")
      .add("relaxed-string-match");
  JsonObjectBuilder features = Json.createObjectBuilder();
  JsonObjectBuilder labels = Json.createObjectBuilder();
  for (Map.Entry<Pair<Integer, Integer>, Boolean> e : mentionPairs.entrySet()) {
    Mention m1 = document.predictedMentionsByID.get(e.getKey().first);
    Mention m2 = document.predictedMentionsByID.get(e.getKey().second);
    String key = m1.mentionNum + " " + m2.mentionNum;
    JsonArrayBuilder builder = Json.createArrayBuilder();
    for (int val : CategoricalFeatureExtractor.pairwiseFeatures(document, m1, m2, dictionaries, conll)) {
      builder.add(val);
    }
    features.add(key, builder.build());
    labels.add(key, e.getValue() ? 1 : 0);
  }
  JsonObject docData = Json.createObjectBuilder()
      .add("sentences", sentences.build())
      .add("mentions", mentions.build())
      .add("labels", labels.build())
      .add("pair_feature_names", featureNames.build())
      .add("pair_features", features.build())
      .add("document_features", docFeatures.build())
      .build();
  dataWriter.println(docData);
}
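The getSentenceArray helper used above is not shown in this listing. A plausible minimal version is sketched below, assuming it simply flattens a sentence's tokens into a JSON array of word strings; the real exporter may additionally normalize token text.

// Hypothetical sketch of getSentenceArray: tokens -> JSON array of word strings.
private static JsonArray getSentenceArray(List<CoreLabel> sentence) {
  JsonArrayBuilder sentenceBuilder = Json.createArrayBuilder();
  for (CoreLabel token : sentence) {
    sentenceBuilder.add(token.word());
  }
  return sentenceBuilder.build();
}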
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
The class FeatureExtractor, method extract.
public DocumentExamples extract(int id, Document document, Map<Pair<Integer, Integer>, Boolean> labeledPairs, Compressor<String> compressor) {
  List<Mention> mentionsList = CorefUtils.getSortedMentions(document);
  Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
  for (Mention m : mentionsList) {
    List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
    if (withIndex == null) {
      withIndex = new ArrayList<>();
      mentionsByHeadIndex.put(m.headIndex, withIndex);
    }
    withIndex.add(m);
  }
  Map<Integer, Mention> mentions = document.predictedMentionsByID;
  List<Example> examples = new ArrayList<>();
  Set<Integer> mentionsToExtract = new HashSet<>();
  for (Map.Entry<Pair<Integer, Integer>, Boolean> pair : labeledPairs.entrySet()) {
    Mention m1 = mentions.get(pair.getKey().first);
    Mention m2 = mentions.get(pair.getKey().second);
    mentionsToExtract.add(m1.mentionID);
    mentionsToExtract.add(m2.mentionID);
    CompressedFeatureVector features = compressor.compress(getFeatures(document, m1, m2));
    examples.add(new Example(id, m1, m2, pair.getValue() ? 1.0 : 0.0, features));
  }
  Map<Integer, CompressedFeatureVector> mentionFeatures = new HashMap<>();
  for (int mentionID : mentionsToExtract) {
    mentionFeatures.put(mentionID, compressor.compress(getFeatures(document, document.predictedMentionsByID.get(mentionID), mentionsByHeadIndex)));
  }
  return new DocumentExamples(id, examples, mentionFeatures);
}
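A side note on the grouping loop used both here and in NeuralCorefDataExporter.process: on Java 8 and later, the head-index map can be built more compactly with identical behavior.

// Equivalent, more compact grouping of mentions by head index (Java 8+):
Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
for (Mention m : CorefUtils.getSortedMentions(document)) {
  mentionsByHeadIndex.computeIfAbsent(m.headIndex, k -> new ArrayList<>()).add(m);
}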
Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.
The class MetadataWriter, method process.
@Override
public void process(int id, Document document) {
  // Mention types
  mentionTypes.put(id, document.predictedMentionsByID.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString())));
  // Gold clusters
  List<List<Integer>> clusters = new ArrayList<>();
  for (CorefCluster c : document.goldCorefClusters.values()) {
    List<Integer> cluster = new ArrayList<>();
    for (Mention m : c.getCorefMentions()) {
      cluster.add(m.mentionID);
    }
    clusters.add(cluster);
  }
  goldClusters.put(id, clusters);
  // Word counting
  if (countWords && mentionPairs.containsKey(id)) {
    Set<Pair<Integer, Integer>> pairs = mentionPairs.get(id).keySet();
    Set<Integer> mentions = new HashSet<>();
    for (Pair<Integer, Integer> pair : pairs) {
      mentions.add(pair.first);
      mentions.add(pair.second);
      Mention m1 = document.predictedMentionsByID.get(pair.first);
      Mention m2 = document.predictedMentionsByID.get(pair.second);
      wordCounts.incrementCount("h_" + m1.headWord.word().toLowerCase() + "_" + m2.headWord.word().toLowerCase());
    }
    Map<Integer, List<CoreLabel>> sentences = new HashMap<>();
    for (int mention : mentions) {
      Mention m = document.predictedMentionsByID.get(mention);
      if (!sentences.containsKey(m.sentNum)) {
        sentences.put(m.sentNum, m.sentenceWords);
      }
    }
    for (List<CoreLabel> sentence : sentences.values()) {
      for (int i = 0; i < sentence.size(); i++) {
        CoreLabel cl = sentence.get(i);
        if (cl == null) {
          continue;
        }
        String w = cl.word().toLowerCase();
        wordCounts.incrementCount(w);
        if (i > 0) {
          CoreLabel clp = sentence.get(i - 1);
          if (clp == null) {
            continue;
          }
          String wp = clp.word().toLowerCase();
          wordCounts.incrementCount(wp + "_" + w);
        }
      }
    }
  }
}
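To make the word-counting pass concrete, it adds three kinds of keys to wordCounts: a head-word pair key per labeled mention pair, plus unigram and bigram keys over each mention's sentence. The Counter type below is an assumption about how wordCounts is declared; the key values are illustrative only.

// Illustrative keys only; ClassicCounter is an assumed backing implementation for wordCounts.
Counter<String> wordCounts = new ClassicCounter<>();
wordCounts.incrementCount("h_obama_he");     // "h_" + m1 head word + "_" + m2 head word
wordCounts.incrementCount("president");      // unigram: lower-cased sentence token
wordCounts.incrementCount("the_president");  // bigram: previous token + "_" + current token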