Use of edu.stanford.nlp.coref.data.CorefChain in project CoreNLP by stanfordnlp.
The class CustomAnnotationSerializer, method loadCorefChains.
/**
* Loads the CorefChain objects from the serialized buffer
* @param reader the buffer
* @return A map from cluster id to the corresponding CorefChain
* @throws IOException
*/
private static Map<Integer, CorefChain> loadCorefChains(BufferedReader reader) throws IOException {
String line = reader.readLine().trim();
if (line.isEmpty())
return null;
int clusterCount = Integer.valueOf(line);
Map<Integer, CorefChain> chains = Generics.newHashMap();
// read each cluster
for (int c = 0; c < clusterCount; c++) {
line = reader.readLine().trim();
String[] bits = line.split("\\s");
int cid = Integer.valueOf(bits[0]);
int mentionCount = Integer.valueOf(bits[1]);
Map<IntPair, Set<CorefChain.CorefMention>> mentionMap = Generics.newHashMap();
CorefChain.CorefMention representative = null;
// read each mention in this cluster
for (int m = 0; m < mentionCount; m++) {
line = reader.readLine();
bits = line.split("\\s");
IntPair key = new IntPair(Integer.valueOf(bits[0]), Integer.valueOf(bits[1]));
boolean rep = bits[2].equals("1");
Dictionaries.MentionType mentionType = parseMentionType(bits[3]);
Dictionaries.Number number = parseNumber(bits[4]);
Dictionaries.Gender gender = parseGender(bits[5]);
Dictionaries.Animacy animacy = parseAnimacy(bits[6]);
int startIndex = Integer.valueOf(bits[7]);
int endIndex = Integer.valueOf(bits[8]);
int headIndex = Integer.valueOf(bits[9]);
int clusterID = Integer.valueOf(bits[10]);
int mentionID = Integer.valueOf(bits[11]);
int sentNum = Integer.valueOf(bits[12]);
int posLen = Integer.valueOf(bits[13]);
int[] posElems = new int[posLen];
for (int i = 0; i < posLen; i++) {
posElems[i] = Integer.valueOf(bits[14 + i]);
}
IntTuple position = new IntTuple(posElems);
String span = unescapeSpace(bits[14 + posLen]);
CorefChain.CorefMention mention = new CorefChain.CorefMention(mentionType, number, gender, animacy, startIndex, endIndex, headIndex, clusterID, mentionID, sentNum, position, span);
Set<CorefChain.CorefMention> mentionsWithThisHead = mentionMap.get(key);
if (mentionsWithThisHead == null) {
mentionsWithThisHead = Generics.newHashSet();
mentionMap.put(key, mentionsWithThisHead);
}
mentionsWithThisHead.add(mention);
if (rep)
representative = mention;
}
// construct the cluster
CorefChain chain = new CorefChain(cid, mentionMap, representative);
chains.put(cid, chain);
}
reader.readLine();
return chains;
}
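A minimal usage sketch for consuming the map from cluster id to CorefChain that this method reconstructs. It assumes a standard StanfordCoreNLP pipeline with the coref annotator; the annotator list and example text are illustrative, not taken from the serializer above.

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Map;
import java.util.Properties;

public class CorefChainExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // coref needs the upstream annotators listed here
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Barack Obama was born in Hawaii. He was elected president in 2008.");
    pipeline.annotate(doc);
    // the same cluster-id-to-chain map that loadCorefChains rebuilds from a serialized buffer
    Map<Integer, CorefChain> chains = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    for (CorefChain chain : chains.values()) {
      CorefChain.CorefMention representative = chain.getRepresentativeMention();
      System.out.println("Cluster " + chain.getChainID() + ", representative: \"" + representative.mentionSpan + "\"");
      for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
        System.out.println("  mention \"" + mention.mentionSpan + "\" in sentence " + mention.sentNum);
      }
    }
  }
}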
Use of edu.stanford.nlp.coref.data.CorefChain in project CoreNLP by stanfordnlp.
The class CustomAnnotationSerializer, method read.
@Override
public Pair<Annotation, InputStream> read(InputStream is) throws IOException {
if (compress && !(is instanceof GZIPInputStream))
is = new GZIPInputStream(is);
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
Annotation doc = new Annotation("");
String line;
// read the coref graph (new format)
Map<Integer, CorefChain> chains = loadCorefChains(reader);
if (chains != null)
doc.set(CorefCoreAnnotations.CorefChainAnnotation.class, chains);
// read the coref graph (old format)
line = reader.readLine().trim();
if (line.length() > 0) {
String[] bits = line.split(" ");
if (bits.length % 4 != 0) {
throw new RuntimeIOException("ERROR: Incorrect format for the serialized coref graph: " + line);
}
List<Pair<IntTuple, IntTuple>> corefGraph = new ArrayList<>();
for (int i = 0; i < bits.length; i += 4) {
IntTuple src = new IntTuple(2);
IntTuple dst = new IntTuple(2);
src.set(0, Integer.parseInt(bits[i]));
src.set(1, Integer.parseInt(bits[i + 1]));
dst.set(0, Integer.parseInt(bits[i + 2]));
dst.set(1, Integer.parseInt(bits[i + 3]));
corefGraph.add(new Pair<>(src, dst));
}
doc.set(CorefCoreAnnotations.CorefGraphAnnotation.class, corefGraph);
}
// read individual sentences
List<CoreMap> sentences = new ArrayList<>();
while ((line = reader.readLine()) != null) {
CoreMap sentence = new Annotation("");
// first line is the parse tree. construct it with CoreLabels in Tree nodes
Tree tree = new PennTreeReader(new StringReader(line), new LabeledScoredTreeFactory(CoreLabel.factory())).readTree();
sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
// read the dependency graphs
IntermediateSemanticGraph intermCollapsedDeps = loadDependencyGraph(reader);
IntermediateSemanticGraph intermUncollapsedDeps = loadDependencyGraph(reader);
IntermediateSemanticGraph intermCcDeps = loadDependencyGraph(reader);
// the remaining lines until empty line are tokens
List<CoreLabel> tokens = new ArrayList<>();
while ((line = reader.readLine()) != null) {
if (line.length() == 0)
break;
CoreLabel token = loadToken(line, haveExplicitAntecedent);
tokens.add(token);
}
sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
// convert the intermediate graph to an actual SemanticGraph
SemanticGraph collapsedDeps = intermCollapsedDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, collapsedDeps);
SemanticGraph uncollapsedDeps = intermUncollapsedDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, uncollapsedDeps);
SemanticGraph ccDeps = intermCcDeps.convertIntermediateGraph(tokens);
sentence.set(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class, ccDeps);
sentences.add(sentence);
}
doc.set(CoreAnnotations.SentencesAnnotation.class, sentences);
return Pair.makePair(doc, is);
}
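A hedged sketch of how this read method pairs with the corresponding write to round-trip a document; it assumes CustomAnnotationSerializer exposes a two-argument (compress, haveExplicitAntecedent) constructor matching the fields referenced above, and the file path is illustrative.

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.CustomAnnotationSerializer;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class SerializerRoundTrip {
  // round-trips an already annotated document through the text-based serializer
  public static Annotation roundTrip(Annotation doc, String path) throws IOException {
    // assumption: (compress, haveExplicitAntecedent) constructor
    CustomAnnotationSerializer serializer = new CustomAnnotationSerializer(true, false);
    try (OutputStream os = new FileOutputStream(path)) {
      serializer.write(doc, os);
    }
    try (InputStream is = new FileInputStream(path)) {
      // read() returns the document plus the (possibly GZIP-wrapped) stream it read from
      return serializer.read(is).first;
    }
  }
}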
Use of edu.stanford.nlp.coref.data.CorefChain in project CoreNLP by stanfordnlp.
The class KBPAnnotator, method annotate.
/**
* Annotate this document for KBP relations.
* @param annotation The document to annotate.
*/
@Override
public void annotate(Annotation annotation) {
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
// Annotate with NER
//casedNER.annotate(annotation);
//caselessNER.annotate(annotation);
// Annotate with Mentions
entityMentionAnnotator.annotate(annotation);
// Create simple document
Document doc = new Document(kbpProperties, serializer.toProto(annotation));
// Get the mentions in the document
List<CoreMap> mentions = new ArrayList<>();
for (CoreMap sentence : sentences) {
mentions.addAll(sentence.get(CoreAnnotations.MentionsAnnotation.class));
}
List<CoreMap> pronounMentions = annotatePronominalMentions(annotation);
mentions.addAll(pronounMentions);
// Compute coreferent clusters
// (map an index to a KBP mention)
Map<Pair<Integer, Integer>, CoreMap> mentionByStartIndex = new HashMap<>();
for (CoreMap mention : mentions) {
for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
mentionByStartIndex.put(Pair.makePair(token.sentIndex(), token.index()), mention);
}
}
// (collect coreferent KBP mentions)
// map from canonical mention -> other mentions
Map<CoreMap, Set<CoreMap>> mentionsMap = new HashMap<>();
if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
for (Map.Entry<Integer, CorefChain> chain : annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet()) {
CoreMap firstMention = null;
for (CorefChain.CorefMention mention : chain.getValue().getMentionsInTextualOrder()) {
CoreMap kbpMention = null;
for (int i = mention.startIndex; i < mention.endIndex; ++i) {
if (mentionByStartIndex.containsKey(Pair.makePair(mention.sentNum - 1, i))) {
kbpMention = mentionByStartIndex.get(Pair.makePair(mention.sentNum - 1, i));
break;
}
}
if (firstMention == null) {
firstMention = kbpMention;
}
if (kbpMention != null) {
if (!mentionsMap.containsKey(firstMention)) {
mentionsMap.put(firstMention, new LinkedHashSet<>());
}
mentionsMap.get(firstMention).add(kbpMention);
}
}
}
}
// (coreference acronyms)
acronymMatch(mentions, mentionsMap);
// (ensure valid NER tag for canonical mention)
for (CoreMap key : new HashSet<>(mentionsMap.keySet())) {
if (key.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null) {
CoreMap newKey = null;
for (CoreMap candidate : mentionsMap.get(key)) {
if (candidate.get(CoreAnnotations.NamedEntityTagAnnotation.class) != null) {
newKey = candidate;
break;
}
}
if (newKey != null) {
mentionsMap.put(newKey, mentionsMap.remove(key));
} else {
// case: no mention in this chain has an NER tag.
mentionsMap.remove(key);
}
}
}
// Propagate Entity Link
for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
String entityLink = entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class);
for (CoreMap mention : entry.getValue()) {
for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
token.set(CoreAnnotations.WikipediaEntityAnnotation.class, entityLink);
}
}
}
// Create a canonical mention map
Map<CoreMap, CoreMap> mentionToCanonicalMention = new HashMap<>();
for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
for (CoreMap mention : entry.getValue()) {
// (set the NER tag + link to be axiomatically that of the canonical mention)
mention.set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.getKey().get(CoreAnnotations.NamedEntityTagAnnotation.class));
mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class));
// (add the mention; note: this must come after we set the NER!)
mentionToCanonicalMention.put(mention, entry.getKey());
}
}
// (add missing mentions)
mentions.stream().filter(mention -> mentionToCanonicalMention.get(mention) == null).forEach(mention -> mentionToCanonicalMention.put(mention, mention));
// Cluster mentions by sentence
@SuppressWarnings("unchecked") List<CoreMap>[] mentionsBySentence = new List[annotation.get(CoreAnnotations.SentencesAnnotation.class).size()];
for (int i = 0; i < mentionsBySentence.length; ++i) {
mentionsBySentence[i] = new ArrayList<>();
}
for (CoreMap mention : mentionToCanonicalMention.keySet()) {
mentionsBySentence[mention.get(CoreAnnotations.SentenceIndexAnnotation.class)].add(mention);
}
// Classify
for (int sentenceI = 0; sentenceI < mentionsBySentence.length; ++sentenceI) {
// the annotations
List<RelationTriple> triples = new ArrayList<>();
List<CoreMap> candidates = mentionsBySentence[sentenceI];
// determine sentence length
int sentenceLength = annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).get(CoreAnnotations.TokensAnnotation.class).size();
// check if sentence is too long, if it's too long don't run kbp
if (maxLength != -1 && sentenceLength > maxLength) {
// set the triples annotation to an empty list of RelationTriples
annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, triples);
// continue to next sentence
continue;
}
// sentence isn't too long, so continue processing this sentence
for (int subjI = 0; subjI < candidates.size(); ++subjI) {
CoreMap subj = candidates.get(subjI);
int subjBegin = subj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
int subjEnd = subj.get(CoreAnnotations.TokensAnnotation.class).get(subj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
Optional<KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.fromString(subj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
if (subjNER.isPresent()) {
for (int objI = 0; objI < candidates.size(); ++objI) {
if (subjI == objI) {
continue;
}
if (Thread.interrupted()) {
throw new RuntimeInterruptedException();
}
CoreMap obj = candidates.get(objI);
int objBegin = obj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
int objEnd = obj.get(CoreAnnotations.TokensAnnotation.class).get(obj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
Optional<KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.fromString(obj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
if (objNER.isPresent() && KBPRelationExtractor.RelationType.plausiblyHasRelation(subjNER.get(), objNER.get())) {
// type check
KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.get(), objNER.get(), doc.sentence(sentenceI));
// -- BEGIN Classify
Pair<String, Double> prediction = extractor.classify(input);
// Handle the classifier output
if (!KBPStatisticalExtractor.NO_RELATION.equals(prediction.first)) {
RelationTriple triple = new RelationTriple.WithLink(subj.get(CoreAnnotations.TokensAnnotation.class), mentionToCanonicalMention.get(subj).get(CoreAnnotations.TokensAnnotation.class), Collections.singletonList(new CoreLabel(new Word(prediction.first))), obj.get(CoreAnnotations.TokensAnnotation.class), mentionToCanonicalMention.get(obj).get(CoreAnnotations.TokensAnnotation.class), prediction.second, sentences.get(sentenceI).get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), subj.get(CoreAnnotations.WikipediaEntityAnnotation.class), obj.get(CoreAnnotations.WikipediaEntityAnnotation.class));
triples.add(triple);
}
}
}
}
}
// Set triples
annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, triples);
}
}
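A minimal pipeline sketch showing how the triples that annotate() stores under KBPTriplesAnnotation are typically retrieved; the annotator list and example text are illustrative assumptions.

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.List;
import java.util.Properties;

public class KbpExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // kbp depends on tokenization, tagging, NER, parsing and coreference
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref,kbp");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Joe Smith was born in Oregon.");
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      // KBPAnnotator.annotate() sets this list (possibly empty) on every sentence
      List<RelationTriple> triples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
      for (RelationTriple triple : triples) {
        System.out.println(triple.subjectGloss() + "\t" + triple.relationGloss() + "\t" + triple.objectGloss() + "\t" + triple.confidence);
      }
    }
  }
}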
Use of edu.stanford.nlp.coref.data.CorefChain in project CoreNLP by stanfordnlp.
The class ProtobufAnnotationSerializer, method toProtoBuilder.
/**
* <p>
* The method to extend by subclasses of the Protobuf Annotator if custom additions are added to Tokens.
* In contrast to {@link ProtobufAnnotationSerializer#toProto(edu.stanford.nlp.ling.CoreLabel)}, this function
* returns a builder that can be extended.
* </p>
*
* @param doc The document to save to a protocol buffer
* @param keysToSerialize A set tracking which keys have been saved. It's important to remove any keys added to the proto
*                        from this set, as the code tracks annotations to ensure lossless serialization.
*/
protected CoreNLPProtos.Document.Builder toProtoBuilder(Annotation doc, Set<Class<?>> keysToSerialize) {
CoreNLPProtos.Document.Builder builder = CoreNLPProtos.Document.newBuilder();
// Required fields
builder.setText(doc.get(TextAnnotation.class));
keysToSerialize.remove(TextAnnotation.class);
// Optional fields
if (doc.containsKey(SentencesAnnotation.class)) {
for (CoreMap sentence : doc.get(SentencesAnnotation.class)) {
builder.addSentence(toProto(sentence));
}
keysToSerialize.remove(SentencesAnnotation.class);
} else if (doc.containsKey(TokensAnnotation.class)) {
for (CoreLabel token : doc.get(TokensAnnotation.class)) {
builder.addSentencelessToken(toProto(token));
}
}
if (doc.containsKey(DocIDAnnotation.class)) {
builder.setDocID(doc.get(DocIDAnnotation.class));
keysToSerialize.remove(DocIDAnnotation.class);
}
if (doc.containsKey(DocDateAnnotation.class)) {
builder.setDocDate(doc.get(DocDateAnnotation.class));
keysToSerialize.remove(DocDateAnnotation.class);
}
if (doc.containsKey(CalendarAnnotation.class)) {
builder.setCalendar(doc.get(CalendarAnnotation.class).toInstant().toEpochMilli());
keysToSerialize.remove(CalendarAnnotation.class);
}
if (doc.containsKey(CorefChainAnnotation.class)) {
for (Map.Entry<Integer, CorefChain> chain : doc.get(CorefChainAnnotation.class).entrySet()) {
builder.addCorefChain(toProto(chain.getValue()));
}
keysToSerialize.remove(CorefChainAnnotation.class);
}
if (doc.containsKey(QuotationsAnnotation.class)) {
for (CoreMap quote : doc.get(QuotationsAnnotation.class)) {
builder.addQuote(toProtoQuote(quote));
}
keysToSerialize.remove(QuotationsAnnotation.class);
}
if (doc.containsKey(MentionsAnnotation.class)) {
for (CoreMap mention : doc.get(MentionsAnnotation.class)) {
builder.addMentions(toProtoMention(mention));
}
keysToSerialize.remove(MentionsAnnotation.class);
}
// add character info from segmenter
if (doc.containsKey(SegmenterCoreAnnotations.CharactersAnnotation.class)) {
for (CoreLabel c : doc.get(SegmenterCoreAnnotations.CharactersAnnotation.class)) {
builder.addCharacter(toProto(c));
}
keysToSerialize.remove(SegmenterCoreAnnotations.CharactersAnnotation.class);
}
// Return
return builder;
}
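A short sketch of how this builder is normally reached through the public ProtobufAnnotationSerializer API, assuming the usual toProto/fromProto round trip.

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;

public class ProtoRoundTrip {
  public static Annotation roundTrip(Annotation doc) {
    // true = enforce lossless serialization (complain if an unknown key would be dropped)
    ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(true);
    // toProto(Annotation) delegates to the toProtoBuilder method shown above
    CoreNLPProtos.Document proto = serializer.toProto(doc);
    // rebuild an Annotation from the protocol buffer
    return serializer.fromProto(proto);
  }
}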
Use of edu.stanford.nlp.coref.data.CorefChain in project CoreNLP by stanfordnlp.
The class TextOutputter, method print.
/**
* The meat of the outputter
*/
private static void print(Annotation annotation, PrintWriter pw, Options options) throws IOException {
double beam = options.beamPrintingOption;
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
// Display docid if available
String docId = annotation.get(CoreAnnotations.DocIDAnnotation.class);
if (docId != null) {
List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
int nSentences = (sentences != null) ? sentences.size() : 0;
int nTokens = (tokens != null) ? tokens.size() : 0;
pw.printf("Document: ID=%s (%d sentences, %d tokens)%n", docId, nSentences, nTokens);
}
// Display doctitle if available
String docTitle = annotation.get(CoreAnnotations.DocTitleAnnotation.class);
if (docTitle != null) {
pw.printf("Document Title: %s%n", docTitle);
}
// Display docdate if available
String docDate = annotation.get(CoreAnnotations.DocDateAnnotation.class);
if (docDate != null) {
pw.printf("Document Date: %s%n", docDate);
}
// Display doctype if available
String docType = annotation.get(CoreAnnotations.DocTypeAnnotation.class);
if (docType != null) {
pw.printf("Document Type: %s%n", docType);
}
// Display docsourcetype if available
String docSourceType = annotation.get(CoreAnnotations.DocSourceTypeAnnotation.class);
if (docSourceType != null) {
pw.printf("Document Source Type: %s%n", docSourceType);
}
// display each sentence in this annotation
if (sentences != null) {
for (int i = 0, sz = sentences.size(); i < sz; i++) {
CoreMap sentence = sentences.get(i);
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
String sentiment = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
if (sentiment == null) {
sentiment = "";
} else {
sentiment = ", sentiment: " + sentiment;
}
pw.printf("Sentence #%d (%d tokens%s):%n", (i + 1), tokens.size(), sentiment);
String text = sentence.get(CoreAnnotations.TextAnnotation.class);
pw.println(text);
// display the token-level annotations
String[] tokenAnnotations = { "Text", "PartOfSpeech", "Lemma", "Answer", "NamedEntityTag", "CharacterOffsetBegin", "CharacterOffsetEnd", "NormalizedNamedEntityTag", "Timex", "TrueCase", "TrueCaseText", "SentimentClass", "WikipediaEntity" };
for (CoreLabel token : tokens) {
pw.print(token.toShorterString(tokenAnnotations));
pw.println();
}
// display the parse tree for this sentence
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
if (tree != null) {
options.constituentTreePrinter.printTree(tree, pw);
}
// display the enhanced dependencies for this sentence; they may be absent, e.g. for a language which doesn't have dependencies
if (sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class) != null) {
pw.print(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class).toList());
pw.println();
}
// display MachineReading entities and relations
List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
if (entities != null) {
pw.println("Extracted the following MachineReading entity mentions:");
for (EntityMention e : entities) {
pw.print('\t');
pw.println(e);
}
}
List<RelationMention> relations = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
if (relations != null) {
pw.println("Extracted the following MachineReading relation mentions:");
for (RelationMention r : relations) {
if (r.printableObject(beam)) {
pw.println(r);
}
}
}
// display OpenIE triples
Collection<RelationTriple> openieTriples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
if (openieTriples != null && openieTriples.size() > 0) {
pw.println("Extracted the following Open IE triples:");
for (RelationTriple triple : openieTriples) {
pw.println(OpenIE.tripleToString(triple, docId, sentence));
}
}
// display KBP triples
Collection<RelationTriple> kbpTriples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
if (kbpTriples != null && kbpTriples.size() > 0) {
pw.println("Extracted the following KBP triples:");
for (RelationTriple triple : kbpTriples) {
pw.println(triple.toString());
}
}
}
}
// display the old-style doc-level coref annotations
// this is not supported anymore!
//String corefAnno = annotation.get(CorefPLAnnotation.class);
//if(corefAnno != null) os.println(corefAnno);
// display the new-style coreference graph
Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
if (corefChains != null && sentences != null) {
for (CorefChain chain : corefChains.values()) {
CorefChain.CorefMention representative = chain.getRepresentativeMention();
boolean outputHeading = false;
for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
if (mention == representative)
continue;
if (!outputHeading) {
outputHeading = true;
pw.println("Coreference set:");
}
// all offsets start at 1!
pw.printf("\t(%d,%d,[%d,%d]) -> (%d,%d,[%d,%d]), that is: \"%s\" -> \"%s\"%n", mention.sentNum, mention.headIndex, mention.startIndex, mention.endIndex, representative.sentNum, representative.headIndex, representative.startIndex, representative.endIndex, mention.mentionSpan, representative.mentionSpan);
}
}
}
// display quotes if available
if (annotation.get(CoreAnnotations.QuotationsAnnotation.class) != null) {
pw.println("Extracted quotes: ");
List<CoreMap> allQuotes = QuoteAnnotator.gatherQuotes(annotation);
for (CoreMap quote : allQuotes) {
pw.printf("[QuotationIndexAnnotation=%d, CharacterOffsetBegin=%d, Text=%s]%n", quote.get(CoreAnnotations.QuotationIndexAnnotation.class), quote.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), quote.get(CoreAnnotations.TextAnnotation.class));
}
}
pw.flush();
}
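In practice this printer is usually invoked indirectly; below is a minimal sketch assuming the StanfordCoreNLP.prettyPrint convenience method, which delegates to TextOutputter (the annotator list and example text are illustrative).

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.io.PrintWriter;
import java.util.Properties;

public class PrettyPrintExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Barack Obama was born in Hawaii. He was elected president in 2008.");
    pipeline.annotate(doc);
    // prettyPrint routes through TextOutputter.print, producing the per-sentence
    // and coreference output shown above
    pipeline.prettyPrint(doc, new PrintWriter(System.out, true));
  }
}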