Search in sources :

Example 1 with SpeakerInfo

use of edu.stanford.nlp.coref.data.SpeakerInfo in project CoreNLP by stanfordnlp.

the class ProtobufAnnotationSerializer method toProto.

/**
 * Converts a coref {@link SpeakerInfo} into its protocol buffer message.
 * The speaker name is copied over, and every mention attached to the speaker
 * is recorded by its {@code mentionID} only.
 *
 * @param speakerInfo The speaker info to serialize.
 * @return A protobuf message holding the speaker name and the IDs of its mentions.
 */
public CoreNLPProtos.SpeakerInfo toProto(SpeakerInfo speakerInfo) {
    CoreNLPProtos.SpeakerInfo.Builder protoBuilder = CoreNLPProtos.SpeakerInfo.newBuilder();
    protoBuilder.setSpeakerName(speakerInfo.getSpeakerName());
    // Only the IDs are written out; they are expected to have been assigned by MentionAnnotator
    speakerInfo.getMentions().forEach(mention -> protoBuilder.addMentions(mention.mentionID));
    return protoBuilder.build();
}
Also used : RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) Mention(edu.stanford.nlp.coref.data.Mention) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo)

Example 2 with SpeakerInfo

use of edu.stanford.nlp.coref.data.SpeakerInfo in project CoreNLP by stanfordnlp.

the class ProtobufAnnotationSerializer method fromProto.

/**
 * Returns a complete document, intended to mimic a document passed as input to
 * {@link ProtobufAnnotationSerializer#toProto(Annotation)} as closely as possible.
 * That is, most common fields are serialized, but there is no guarantee that custom additions
 * will be saved and retrieved.
 *
 * <p>The document is rebuilt in stages: characters, tokens, sentences, document-level metadata
 * (doc id, date, calendar), coref chains, per-sentence annotations that need document context
 * (dependency graphs, entailments, OpenIE triples, coref mentions), quotes, NER mentions and,
 * finally, the speaker info attached to coref mentions.
 *
 * @param proto The protocol buffer to read the document from.
 * @return An Annotation corresponding to the read protobuf.
 * @throws RuntimeInterruptedException If the calling thread's interrupt flag was set on entry.
 */
@SuppressWarnings("deprecation")
public Annotation fromProto(CoreNLPProtos.Document proto) {
    if (Thread.interrupted()) {
        throw new RuntimeInterruptedException();
    }
    // Set text
    Annotation ann = new Annotation(proto.getText());
    // if there are characters, add characters
    if (proto.getCharacterCount() > 0) {
        List<CoreLabel> docChars = new ArrayList<>();
        for (CoreNLPProtos.Token c : proto.getCharacterList()) {
            docChars.add(fromProto(c));
        }
        ann.set(SegmenterCoreAnnotations.CharactersAnnotation.class, docChars);
    }
    // Add tokens
    List<CoreLabel> tokens = new ArrayList<>();
    if (proto.getSentenceCount() > 0) {
        // Populate the tokens from the sentence
        for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
            // It's conceivable that the sentences are not contiguous -- pad this with nulls
            while (sentence.hasTokenOffsetBegin() && tokens.size() < sentence.getTokenOffsetBegin()) {
                tokens.add(null);
            }
            // Read the sentence
            for (CoreNLPProtos.Token token : sentence.getTokenList()) {
                CoreLabel coreLabel = fromProto(token);
                // Set docid
                if (proto.hasDocID()) {
                    coreLabel.setDocID(proto.getDocID());
                }
                if (token.hasTokenBeginIndex() && token.hasTokenEndIndex()) {
                    // This is usually true, if enough annotators are defined.
                    // Pad far enough to hold both the sentence and this token's own span, so the
                    // writes below can never land past the end of the list.
                    int requiredSize = Math.max(sentence.getTokenOffsetEnd(), token.getTokenEndIndex());
                    while (tokens.size() < requiredSize) {
                        tokens.add(null);
                    }
                    // Fill every slot the token spans; writing only the begin index would leave
                    // null entries behind for a token covering more than one position.
                    for (int i = token.getTokenBeginIndex(); i < token.getTokenEndIndex(); ++i) {
                        tokens.set(i, coreLabel);
                    }
                } else {
                    // Assume this token spans a single token, and just add it to the tokens list
                    tokens.add(coreLabel);
                }
            }
        }
    } else if (proto.getSentencelessTokenCount() > 0) {
        // Eek -- no sentences. Try to recover tokens directly
        for (CoreNLPProtos.Token token : proto.getSentencelessTokenList()) {
            CoreLabel coreLabel = fromProto(token);
            // Set docid
            if (proto.hasDocID()) {
                coreLabel.setDocID(proto.getDocID());
            }
            tokens.add(coreLabel);
        }
    }
    if (!tokens.isEmpty()) {
        ann.set(TokensAnnotation.class, tokens);
    }
    // Add sentences
    List<CoreMap> sentences = new ArrayList<>(proto.getSentenceCount());
    for (int sentIndex = 0; sentIndex < proto.getSentenceCount(); ++sentIndex) {
        CoreNLPProtos.Sentence sentence = proto.getSentence(sentIndex);
        CoreMap map = fromProtoNoTokens(sentence);
        if (!tokens.isEmpty() && sentence.hasTokenOffsetBegin() && sentence.hasTokenOffsetEnd() && map.get(TokensAnnotation.class) == null) {
            // Set tokens for sentence
            int tokenBegin = sentence.getTokenOffsetBegin();
            int tokenEnd = sentence.getTokenOffsetEnd();
            assert tokenBegin <= tokens.size() && tokenBegin <= tokenEnd;
            assert tokenEnd <= tokens.size();
            // The sentence shares the document's token objects through a sub-list view
            map.set(TokensAnnotation.class, tokens.subList(tokenBegin, tokenEnd));
            // Set sentence index + token index + paragraph index
            for (int i = tokenBegin; i < tokenEnd; ++i) {
                tokens.get(i).setSentIndex(sentIndex);
                // Token indices within a sentence are 1-based
                tokens.get(i).setIndex(i - sentence.getTokenOffsetBegin() + 1);
                if (sentence.hasParagraph()) {
                    tokens.get(i).set(ParagraphAnnotation.class, sentence.getParagraph());
                }
            }
            // Set text
            int characterBegin = sentence.getCharacterOffsetBegin();
            int characterEnd = sentence.getCharacterOffsetEnd();
            if (characterEnd <= proto.getText().length()) {
                // The usual case -- get the text from the document text
                map.set(TextAnnotation.class, proto.getText().substring(characterBegin, characterEnd));
            } else {
                // The document text is wrong -- guess the text from the tokens
                map.set(TextAnnotation.class, recoverOriginalText(tokens.subList(tokenBegin, tokenEnd), sentence));
            }
        }
        // End iteration
        sentences.add(map);
    }
    if (!sentences.isEmpty()) {
        ann.set(SentencesAnnotation.class, sentences);
    }
    // Set DocID
    String docid = null;
    if (proto.hasDocID()) {
        docid = proto.getDocID();
        ann.set(DocIDAnnotation.class, docid);
    }
    // Set reference time
    if (proto.hasDocDate()) {
        ann.set(DocDateAnnotation.class, proto.getDocDate());
    }
    if (proto.hasCalendar()) {
        GregorianCalendar calendar = new GregorianCalendar();
        calendar.setTimeInMillis(proto.getCalendar());
        ann.set(CalendarAnnotation.class, calendar);
    }
    // Set coref chain
    Map<Integer, CorefChain> corefChains = new HashMap<>();
    for (CoreNLPProtos.CorefChain chainProto : proto.getCorefChainList()) {
        CorefChain chain = fromProto(chainProto, ann);
        corefChains.put(chain.getChainID(), chain);
    }
    if (!corefChains.isEmpty()) {
        ann.set(CorefChainAnnotation.class, corefChains);
    }
    // hashes to access Mentions , later in this method need to add speakerInfo to Mention
    // so we need to create id -> Mention, CoreNLPProtos.Mention maps to do this, since SpeakerInfo could reference
    // any Mention in doc
    HashMap<Integer, Mention> idToMention = new HashMap<>();
    HashMap<Integer, CoreNLPProtos.Mention> idToProtoMention = new HashMap<>();
    // Set things in the sentence that need a document context.
    for (int sentenceIndex = 0; sentenceIndex < proto.getSentenceCount(); ++sentenceIndex) {
        CoreNLPProtos.Sentence sentence = proto.getSentenceList().get(sentenceIndex);
        CoreMap map = sentences.get(sentenceIndex);
        List<CoreLabel> sentenceTokens = map.get(TokensAnnotation.class);
        // Set dependency graphs
        if (sentence.hasBasicDependencies()) {
            map.set(BasicDependenciesAnnotation.class, fromProto(sentence.getBasicDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasCollapsedDependencies()) {
            map.set(CollapsedDependenciesAnnotation.class, fromProto(sentence.getCollapsedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasCollapsedCCProcessedDependencies()) {
            map.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(sentence.getCollapsedCCProcessedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasAlternativeDependencies()) {
            map.set(AlternativeDependenciesAnnotation.class, fromProto(sentence.getAlternativeDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasEnhancedDependencies()) {
            map.set(EnhancedDependenciesAnnotation.class, fromProto(sentence.getEnhancedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasEnhancedPlusPlusDependencies()) {
            map.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(sentence.getEnhancedPlusPlusDependencies(), sentenceTokens, docid));
        }
        // Set entailed sentences
        if (sentence.getEntailedSentenceCount() > 0) {
            Set<SentenceFragment> entailedSentences = sentence.getEntailedSentenceList().stream().map(frag -> fromProto(frag, map.get(EnhancedPlusPlusDependenciesAnnotation.class))).collect(Collectors.toSet());
            map.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences);
        }
        if (sentence.getEntailedClauseCount() > 0) {
            Set<SentenceFragment> entailedClauses = sentence.getEntailedClauseList().stream().map(frag -> fromProto(frag, map.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toSet());
            map.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses);
        }
        // Set relation triples
        if (sentence.getOpenieTripleCount() > 0) {
            List<RelationTriple> triples = new ArrayList<>();
            for (CoreNLPProtos.RelationTriple triple : sentence.getOpenieTripleList()) {
                triples.add(fromProto(triple, ann, sentenceIndex));
            }
            map.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, triples);
        }
        // Redo some light annotation
        if (map.containsKey(TokensAnnotation.class) && (!sentence.hasHasNumerizedTokensAnnotation() || sentence.getHasNumerizedTokensAnnotation())) {
            map.set(NumerizedTokensAnnotation.class, NumberNormalizer.findAndMergeNumbers(map));
        }
        // add the CoreLabel and IndexedWord info to each mention
        // when Mentions are serialized, just storing the index in the sentence for CoreLabels and IndexedWords
        // this is the point where the de-serialized sentence has tokens
        int mentionInt = 0;
        for (CoreNLPProtos.Mention protoMention : sentence.getMentionsForCorefList()) {
            // get the mention; the proto list and the sentence's mention list are walked in lockstep
            Mention mentionToUpdate = map.get(CorefMentionsAnnotation.class).get(mentionInt);
            // store these in hash for more processing later in this method
            idToMention.put(mentionToUpdate.mentionID, mentionToUpdate);
            idToProtoMention.put(mentionToUpdate.mentionID, protoMention);
            // update the values; a negative token index means the field is left untouched
            int headIndexedWordIndex = protoMention.getHeadIndexedWord().getTokenIndex();
            if (headIndexedWordIndex >= 0) {
                mentionToUpdate.headIndexedWord = new IndexedWord(sentenceTokens.get(headIndexedWordIndex));
                mentionToUpdate.headIndexedWord.setCopyCount(protoMention.getHeadIndexedWord().getCopyCount());
            }
            int dependingVerbIndex = protoMention.getDependingVerb().getTokenIndex();
            if (dependingVerbIndex >= 0) {
                mentionToUpdate.dependingVerb = new IndexedWord(sentenceTokens.get(dependingVerbIndex));
                mentionToUpdate.dependingVerb.setCopyCount(protoMention.getDependingVerb().getCopyCount());
            }
            int headWordIndex = protoMention.getHeadWord().getTokenIndex();
            if (headWordIndex >= 0) {
                mentionToUpdate.headWord = sentenceTokens.get(headWordIndex);
            }
            mentionToUpdate.sentenceWords = new ArrayList<>();
            for (CoreNLPProtos.IndexedWord clp : protoMention.getSentenceWordsList()) {
                int ti = clp.getTokenIndex();
                mentionToUpdate.sentenceWords.add(sentenceTokens.get(ti));
            }
            mentionToUpdate.originalSpan = new ArrayList<>();
            for (CoreNLPProtos.IndexedWord clp : protoMention.getOriginalSpanList()) {
                int ti = clp.getTokenIndex();
                mentionToUpdate.originalSpan.add(sentenceTokens.get(ti));
            }
            if (protoMention.getHasBasicDependency()) {
                mentionToUpdate.basicDependency = map.get(BasicDependenciesAnnotation.class);
            }
            if (protoMention.getHasEnhancedDepenedncy()) {
                mentionToUpdate.enhancedDependency = map.get(EnhancedDependenciesAnnotation.class);
            }
            if (protoMention.getHasContextParseTree()) {
                mentionToUpdate.contextParseTree = map.get(TreeAnnotation.class);
            }
            // move on to next mention
            mentionInt++;
        }
    }
    // Set quotes
    List<CoreMap> quotes = proto.getQuoteList().stream().map(quote -> fromProto(quote, tokens)).collect(Collectors.toList());
    if (!quotes.isEmpty()) {
        ann.set(QuotationsAnnotation.class, quotes);
    }
    // Set NERmention
    List<CoreMap> mentions = proto.getMentionsList().stream().map(this::fromProto).collect(Collectors.toList());
    if (!mentions.isEmpty()) {
        ann.set(MentionsAnnotation.class, mentions);
    }
    // also add all the Set<Mention>
    for (Map.Entry<Integer, Mention> mentionEntry : idToMention.entrySet()) {
        // this is the Mention message corresponding to this Mention
        Mention mentionToUpdate = mentionEntry.getValue();
        CoreNLPProtos.Mention correspondingProtoMention = idToProtoMention.get(mentionEntry.getKey());
        if (!correspondingProtoMention.hasSpeakerInfo()) {
            // so just continue to next Mention
            continue;
        }
        // if we're here we know a speakerInfo was stored
        SpeakerInfo speakerInfo = fromProto(correspondingProtoMention.getSpeakerInfo());
        // MentionID is ID in document, 0, 1, 2, etc...
        for (int speakerInfoMentionID : correspondingProtoMention.getSpeakerInfo().getMentionsList()) {
            speakerInfo.addMention(idToMention.get(speakerInfoMentionID));
        }
        // now the SpeakerInfo for this Mention should be fully restored
        mentionToUpdate.speakerInfo = speakerInfo;
    }
    // Return
    return ann;
}
Also used : ExtractionObject(edu.stanford.nlp.ie.machinereading.structure.ExtractionObject) java.util(java.util) CorefChain(edu.stanford.nlp.coref.data.CorefChain) edu.stanford.nlp.util(edu.stanford.nlp.util) Tree(edu.stanford.nlp.trees.Tree) Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) MachineReadingAnnotations(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations) TimeAnnotations(edu.stanford.nlp.time.TimeAnnotations) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) Mention(edu.stanford.nlp.coref.data.Mention) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) Language(edu.stanford.nlp.international.Language) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) LabeledScoredTreeNode(edu.stanford.nlp.trees.LabeledScoredTreeNode) Timex(edu.stanford.nlp.time.Timex) IndexedWord(edu.stanford.nlp.ling.IndexedWord) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) GrammaticalRelation(edu.stanford.nlp.trees.GrammaticalRelation) edu.stanford.nlp.naturalli(edu.stanford.nlp.naturalli) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) NumberNormalizer(edu.stanford.nlp.ie.NumberNormalizer) Collectors(java.util.stream.Collectors) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) SegmenterCoreAnnotations(edu.stanford.nlp.ling.SegmenterCoreAnnotations) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Word(edu.stanford.nlp.ling.Word) java.io(java.io) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) 
CorefChain(edu.stanford.nlp.coref.data.CorefChain) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) Mention(edu.stanford.nlp.coref.data.Mention) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) SegmenterCoreAnnotations(edu.stanford.nlp.ling.SegmenterCoreAnnotations) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 3 with SpeakerInfo

use of edu.stanford.nlp.coref.data.SpeakerInfo in project CoreNLP by stanfordnlp.

the class CorefRules method antecedentMatchesMentionSpeakerAnnotation.

/**
 * Checks whether the antecedent matches the speaker annotation carried by the mention's head word.
 *
 * <p>When the document has a {@link SpeakerInfo} registered for that speaker, the decision is
 * delegated to {@code mentionMatchesSpeaker}. Otherwise the antecedent's head string is compared,
 * ignoring case, against the speaker string -- or against each whitespace-separated piece of it
 * when the speaker string contains a space.
 *
 * @param mention The mention whose head word supplies the speaker annotation.
 * @param ant The candidate antecedent.
 * @param document The enclosing document; may be null, in which case no speaker info is consulted.
 * @return true if the antecedent matches the speaker; false otherwise, including when the mention
 *         has no head word or no speaker annotation.
 */
public static boolean antecedentMatchesMentionSpeakerAnnotation(Mention mention, Mention ant, Document document) {
    if (mention.headWord == null) {
        return false;
    }
    final String speakerName = mention.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
    if (speakerName == null) {
        return false;
    }
    if (document != null) {
        final SpeakerInfo knownSpeaker = document.getSpeakerInfo(speakerName);
        if (knownSpeaker != null) {
            return mentionMatchesSpeaker(ant, knownSpeaker, false);
        }
    }
    // Single-word speaker: one direct comparison is enough
    if (!speakerName.contains(" ")) {
        return ant.headString.equalsIgnoreCase(speakerName);
    }
    // Multi-word speaker: regex split, then accept a match on any piece
    // Perhaps we could optimize this, too, but that would be trickier
    for (String namePart : WHITESPACE_PATTERN.split(speakerName)) {
        if (ant.headString.equalsIgnoreCase(namePart)) {
            return true;
        }
    }
    return false;
}
Also used : CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo)

Example 4 with SpeakerInfo

use of edu.stanford.nlp.coref.data.SpeakerInfo in project CoreNLP by stanfordnlp.

the class CorefRules method getSpeakerClusterId.

/**
 * Given the name of a speaker, returns the coref cluster id it belongs to (-1 if no cluster).
 *
 * <p>The speaker info registered in the document is consulted first. If that yields no
 * non-negative cluster id and the speaker string is a decimal integer, the string is treated
 * as a mention id and looked up among the document's predicted mentions.
 *
 * @param document The document to search in
 * @param speakerString The name to search for; a null name yields -1
 * @return cluster id
 */
public static int getSpeakerClusterId(Document document, String speakerString) {
    if (speakerString == null) {
        return -1;
    }
    // try looking up cluster id from speaker info
    final SpeakerInfo info = document.getSpeakerInfo(speakerString);
    int clusterId = (info == null) ? -1 : info.getCorefClusterId();
    if (clusterId >= 0 || !NumberMatchingRegex.isDecimalInteger(speakerString)) {
        return clusterId;
    }
    // speakerString is number so is mention id
    try {
        int mentionId = Integer.parseInt(speakerString);
        Mention speakerMention = document.predictedMentionsByID.get(mentionId);
        if (speakerMention != null) {
            clusterId = speakerMention.corefClusterID;
            if (info != null) {
                info.addMention(speakerMention);
            }
        }
    } catch (Exception ignored) {
        // Best-effort lookup: any failure here leaves the cluster id as computed so far
    }
    return clusterId;
}
Also used : Mention(edu.stanford.nlp.coref.data.Mention) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo)

Aggregations

SpeakerInfo (edu.stanford.nlp.coref.data.SpeakerInfo)4 Mention (edu.stanford.nlp.coref.data.Mention)3 EntityMention (edu.stanford.nlp.ie.machinereading.structure.EntityMention)2 RelationMention (edu.stanford.nlp.ie.machinereading.structure.RelationMention)2 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)2 CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)1 CorefChain (edu.stanford.nlp.coref.data.CorefChain)1 Dictionaries (edu.stanford.nlp.coref.data.Dictionaries)1 NumberNormalizer (edu.stanford.nlp.ie.NumberNormalizer)1 ExtractionObject (edu.stanford.nlp.ie.machinereading.structure.ExtractionObject)1 MachineReadingAnnotations (edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations)1 Span (edu.stanford.nlp.ie.machinereading.structure.Span)1 RelationTriple (edu.stanford.nlp.ie.util.RelationTriple)1 Language (edu.stanford.nlp.international.Language)1 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)1 CoreLabel (edu.stanford.nlp.ling.CoreLabel)1 IndexedWord (edu.stanford.nlp.ling.IndexedWord)1 SegmenterCoreAnnotations (edu.stanford.nlp.ling.SegmenterCoreAnnotations)1 Word (edu.stanford.nlp.ling.Word)1 edu.stanford.nlp.naturalli (edu.stanford.nlp.naturalli)1