
Example 36 with Mention

Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.

The class ProtobufAnnotationSerializer, method toProto.

public CoreNLPProtos.Mention toProto(Mention mention) {
    // create the builder
    CoreNLPProtos.Mention.Builder builder = CoreNLPProtos.Mention.newBuilder();
    // set enums
    if (mention.mentionType != null) {
        builder.setMentionType(mention.mentionType.name());
    }
    if (mention.gender != null) {
        builder.setGender(mention.gender.name());
    }
    if (mention.number != null) {
        builder.setNumber(mention.number.name());
    }
    if (mention.animacy != null) {
        builder.setAnimacy(mention.animacy.name());
    }
    if (mention.person != null) {
        builder.setPerson(mention.person.name());
    }
    if (mention.headString != null) {
        builder.setHeadString(mention.headString);
    }
    if (mention.nerString != null) {
        builder.setNerString(mention.nerString);
    }
    builder.setStartIndex(mention.startIndex);
    builder.setEndIndex(mention.endIndex);
    builder.setHeadIndex(mention.headIndex);
    builder.setMentionID(mention.mentionID);
    builder.setOriginalRef(mention.originalRef);
    builder.setGoldCorefClusterID(mention.goldCorefClusterID);
    builder.setCorefClusterID(mention.corefClusterID);
    builder.setMentionNum(mention.mentionNum);
    builder.setSentNum(mention.sentNum);
    builder.setUtter(mention.utter);
    builder.setParagraph(mention.paragraph);
    builder.setIsSubject(mention.isSubject);
    builder.setIsDirectObject(mention.isDirectObject);
    builder.setIsIndirectObject(mention.isIndirectObject);
    builder.setIsPrepositionObject(mention.isPrepositionObject);
    builder.setHasTwin(mention.hasTwin);
    builder.setGeneric(mention.generic);
    builder.setIsSingleton(mention.isSingleton);
    // handle the two sets of Strings
    if (mention.dependents != null) {
        mention.dependents.forEach(builder::addDependents);
    }
    if (mention.preprocessedTerms != null) {
        mention.preprocessedTerms.forEach(builder::addPreprocessedTerms);
    }
    // set IndexedWords by storing (sentence number, token index) pairs
    builder.setDependingVerb(createIndexedWordProtoFromIW(mention.dependingVerb));
    builder.setHeadIndexedWord(createIndexedWordProtoFromIW(mention.headIndexedWord));
    builder.setHeadWord(createIndexedWordProtoFromCL(mention.headWord));
    // add positions for each CoreLabel in sentence
    if (mention.sentenceWords != null) {
        for (CoreLabel cl : mention.sentenceWords) {
            builder.addSentenceWords(createIndexedWordProtoFromCL(cl));
        }
    }
    if (mention.originalSpan != null) {
        for (CoreLabel cl : mention.originalSpan) {
            builder.addOriginalSpan(createIndexedWordProtoFromCL(cl));
        }
    }
    // flag whether this Mention carries a basicDependency, enhancedDependency, and contextParseTree
    builder.setHasBasicDependency((mention.basicDependency != null));
    // "Depenedncy" is misspelled in the generated CoreNLPProtos API itself, so the call must match it
    builder.setHasEnhancedDepenedncy((mention.enhancedDependency != null));
    builder.setHasContextParseTree((mention.contextParseTree != null));
    // handle the sets of Mentions, just store mentionID
    if (mention.appositions != null) {
        for (Mention m : mention.appositions) {
            builder.addAppositions(m.mentionID);
        }
    }
    if (mention.predicateNominatives != null) {
        for (Mention m : mention.predicateNominatives) {
            builder.addPredicateNominatives(m.mentionID);
        }
    }
    if (mention.relativePronouns != null) {
        for (Mention m : mention.relativePronouns) {
            builder.addRelativePronouns(m.mentionID);
        }
    }
    if (mention.listMembers != null) {
        for (Mention m : mention.listMembers) {
            builder.addListMembers(m.mentionID);
        }
    }
    if (mention.belongToLists != null) {
        for (Mention m : mention.belongToLists) {
            builder.addBelongToLists(m.mentionID);
        }
    }
    if (mention.speakerInfo != null) {
        builder.setSpeakerInfo(toProto(mention.speakerInfo));
    }
    return builder.build();
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) Mention(edu.stanford.nlp.coref.data.Mention) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention)
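A quick usage sketch (not part of the CoreNLP source): toProto(Mention) is public on the serializer, so a single coref mention can be converted and inspected directly. The Mention passed in is assumed to come from a coref-annotated document.

import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;

public class MentionToProtoDemo {

    // `mention` stands in for any Mention taken from a coref-annotated document
    static byte[] serializeMention(Mention mention) {
        ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
        CoreNLPProtos.Mention proto = serializer.toProto(mention);
        // enum-valued fields were stored via name(), so they read back as plain Strings
        System.out.println(proto.getMentionID() + " / " + proto.getMentionType());
        // standard protobuf wire encoding of the message
        return proto.toByteArray();
    }
}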

Example 37 with Mention

Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.

The class ProtobufAnnotationSerializer, method fromProto.

/**
   * Returns a complete document, intended to mimic a document passed as input to
   * {@link ProtobufAnnotationSerializer#toProto(Annotation)} as closely as possible.
   * That is, most common fields are serialized, but there is no guarantee that custom additions
   * will be saved and retrieved.
   *
   * @param proto The protocol buffer to read the document from.
   * @return An Annotation corresponding to the read protobuf.
   */
@SuppressWarnings("deprecation")
public Annotation fromProto(CoreNLPProtos.Document proto) {
    if (Thread.interrupted()) {
        throw new RuntimeInterruptedException();
    }
    // Set text
    Annotation ann = new Annotation(proto.getText());
    // if there are characters, add characters
    if (proto.getCharacterCount() > 0) {
        List<CoreLabel> docChars = new ArrayList<CoreLabel>();
        for (CoreNLPProtos.Token c : proto.getCharacterList()) {
            docChars.add(fromProto(c));
        }
        ann.set(SegmenterCoreAnnotations.CharactersAnnotation.class, docChars);
    }
    // Add tokens
    List<CoreLabel> tokens = new ArrayList<>();
    if (proto.getSentenceCount() > 0) {
        // Populate the tokens from the sentence
        for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
            // It's conceivable that the sentences are not contiguous -- pad this with nulls
            while (sentence.hasTokenOffsetBegin() && tokens.size() < sentence.getTokenOffsetBegin()) {
                tokens.add(null);
            }
            // Read the sentence
            for (CoreNLPProtos.Token token : sentence.getTokenList()) {
                CoreLabel coreLabel = fromProto(token);
                // Set docid
                if (proto.hasDocID()) {
                    coreLabel.setDocID(proto.getDocID());
                }
                if (token.hasTokenBeginIndex() && token.hasTokenEndIndex()) {
                    // This is usually true, if enough annotators are defined
                    while (tokens.size() < sentence.getTokenOffsetEnd()) {
                        tokens.add(null);
                    }
                    // fill every slot this (possibly multi-word) token spans
                    for (int i = token.getTokenBeginIndex(); i < token.getTokenEndIndex(); ++i) {
                        tokens.set(i, coreLabel);
                    }
                } else {
                    // Assume this token spans a single token, and just add it to the tokens list
                    tokens.add(coreLabel);
                }
            }
        }
    } else if (proto.getSentencelessTokenCount() > 0) {
        // Eek -- no sentences. Try to recover tokens directly
        for (CoreNLPProtos.Token token : proto.getSentencelessTokenList()) {
            CoreLabel coreLabel = fromProto(token);
            // Set docid
            if (proto.hasDocID()) {
                coreLabel.setDocID(proto.getDocID());
            }
            tokens.add(coreLabel);
        }
    }
    if (!tokens.isEmpty()) {
        ann.set(TokensAnnotation.class, tokens);
    }
    // Add sentences
    List<CoreMap> sentences = new ArrayList<>(proto.getSentenceCount());
    for (int sentIndex = 0; sentIndex < proto.getSentenceCount(); ++sentIndex) {
        CoreNLPProtos.Sentence sentence = proto.getSentence(sentIndex);
        CoreMap map = fromProtoNoTokens(sentence);
        if (!tokens.isEmpty() && sentence.hasTokenOffsetBegin() && sentence.hasTokenOffsetEnd() && map.get(TokensAnnotation.class) == null) {
            // Set tokens for sentence
            int tokenBegin = sentence.getTokenOffsetBegin();
            int tokenEnd = sentence.getTokenOffsetEnd();
            assert tokenBegin <= tokens.size() && tokenBegin <= tokenEnd;
            assert tokenEnd <= tokens.size();
            map.set(TokensAnnotation.class, tokens.subList(tokenBegin, tokenEnd));
            // Set sentence index + token index + paragraph index
            for (int i = tokenBegin; i < tokenEnd; ++i) {
                tokens.get(i).setSentIndex(sentIndex);
                tokens.get(i).setIndex(i - sentence.getTokenOffsetBegin() + 1);
                if (sentence.hasParagraph()) {
                    tokens.get(i).set(ParagraphAnnotation.class, sentence.getParagraph());
                }
            }
            // Set text
            int characterBegin = sentence.getCharacterOffsetBegin();
            int characterEnd = sentence.getCharacterOffsetEnd();
            if (characterEnd <= proto.getText().length()) {
                // The usual case -- get the text from the document text
                map.set(TextAnnotation.class, proto.getText().substring(characterBegin, characterEnd));
            } else {
                // The document text is wrong -- guess the text from the tokens
                map.set(TextAnnotation.class, recoverOriginalText(tokens.subList(tokenBegin, tokenEnd), sentence));
            }
        }
        // End iteration
        sentences.add(map);
    }
    if (!sentences.isEmpty()) {
        ann.set(SentencesAnnotation.class, sentences);
    }
    // Set DocID
    String docid = null;
    if (proto.hasDocID()) {
        docid = proto.getDocID();
        ann.set(DocIDAnnotation.class, docid);
    }
    // Set reference time
    if (proto.hasDocDate()) {
        ann.set(DocDateAnnotation.class, proto.getDocDate());
    }
    if (proto.hasCalendar()) {
        GregorianCalendar calendar = new GregorianCalendar();
        calendar.setTimeInMillis(proto.getCalendar());
        ann.set(CalendarAnnotation.class, calendar);
    }
    // Set coref chain
    Map<Integer, CorefChain> corefChains = new HashMap<>();
    for (CoreNLPProtos.CorefChain chainProto : proto.getCorefChainList()) {
        CorefChain chain = fromProto(chainProto, ann);
        corefChains.put(chain.getChainID(), chain);
    }
    if (!corefChains.isEmpty()) {
        ann.set(CorefChainAnnotation.class, corefChains);
    }
    // Build id -> Mention and id -> CoreNLPProtos.Mention maps. We need them later in this method
    // to attach speakerInfo to each Mention, since a SpeakerInfo can reference any Mention in the document.
    HashMap<Integer, Mention> idToMention = new HashMap<>();
    HashMap<Integer, CoreNLPProtos.Mention> idToProtoMention = new HashMap<>();
    // Set things in the sentence that need a document context.
    for (int sentenceIndex = 0; sentenceIndex < proto.getSentenceCount(); ++sentenceIndex) {
        CoreNLPProtos.Sentence sentence = proto.getSentenceList().get(sentenceIndex);
        CoreMap map = sentences.get(sentenceIndex);
        List<CoreLabel> sentenceTokens = map.get(TokensAnnotation.class);
        // Set dependency graphs
        if (sentence.hasBasicDependencies()) {
            map.set(BasicDependenciesAnnotation.class, fromProto(sentence.getBasicDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasCollapsedDependencies()) {
            map.set(CollapsedDependenciesAnnotation.class, fromProto(sentence.getCollapsedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasCollapsedCCProcessedDependencies()) {
            map.set(CollapsedCCProcessedDependenciesAnnotation.class, fromProto(sentence.getCollapsedCCProcessedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasAlternativeDependencies()) {
            map.set(AlternativeDependenciesAnnotation.class, fromProto(sentence.getAlternativeDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasEnhancedDependencies()) {
            map.set(EnhancedDependenciesAnnotation.class, fromProto(sentence.getEnhancedDependencies(), sentenceTokens, docid));
        }
        if (sentence.hasEnhancedPlusPlusDependencies()) {
            map.set(EnhancedPlusPlusDependenciesAnnotation.class, fromProto(sentence.getEnhancedPlusPlusDependencies(), sentenceTokens, docid));
        }
        // Set entailed sentences
        if (sentence.getEntailedSentenceCount() > 0) {
            Set<SentenceFragment> entailedSentences = sentence.getEntailedSentenceList().stream().map(frag -> fromProto(frag, map.get(EnhancedPlusPlusDependenciesAnnotation.class))).collect(Collectors.toSet());
            map.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, entailedSentences);
        }
        if (sentence.getEntailedClauseCount() > 0) {
            Set<SentenceFragment> entailedClauses = sentence.getEntailedClauseList().stream().map(frag -> fromProto(frag, map.get(CollapsedDependenciesAnnotation.class))).collect(Collectors.toSet());
            map.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, entailedClauses);
        }
        // Set relation triples
        if (sentence.getOpenieTripleCount() > 0) {
            List<RelationTriple> triples = new ArrayList<>();
            for (CoreNLPProtos.RelationTriple triple : sentence.getOpenieTripleList()) {
                triples.add(fromProto(triple, ann, sentenceIndex));
            }
            map.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, triples);
        }
        // Redo some light annotation
        if (map.containsKey(TokensAnnotation.class) && (!sentence.hasHasNumerizedTokensAnnotation() || sentence.getHasNumerizedTokensAnnotation())) {
            map.set(NumerizedTokensAnnotation.class, NumberNormalizer.findAndMergeNumbers(map));
        }
        // Add the CoreLabel and IndexedWord info to each mention. When Mentions are serialized,
        // only each token's index within the sentence is stored for CoreLabels and IndexedWords;
        // by this point the de-serialized sentence has its tokens, so those objects can be restored.
        int mentionInt = 0;
        for (CoreNLPProtos.Mention protoMention : sentence.getMentionsForCorefList()) {
            // get the mention
            Mention mentionToUpdate = map.get(CorefMentionsAnnotation.class).get(mentionInt);
            // store these in hash for more processing later in this method
            idToMention.put(mentionToUpdate.mentionID, mentionToUpdate);
            idToProtoMention.put(mentionToUpdate.mentionID, protoMention);
            // update the values
            int headIndexedWordIndex = protoMention.getHeadIndexedWord().getTokenIndex();
            if (headIndexedWordIndex >= 0) {
                mentionToUpdate.headIndexedWord = new IndexedWord(sentenceTokens.get(headIndexedWordIndex));
                mentionToUpdate.headIndexedWord.setCopyCount(protoMention.getHeadIndexedWord().getCopyCount());
            }
            int dependingVerbIndex = protoMention.getDependingVerb().getTokenIndex();
            if (dependingVerbIndex >= 0) {
                mentionToUpdate.dependingVerb = new IndexedWord(sentenceTokens.get(dependingVerbIndex));
                mentionToUpdate.dependingVerb.setCopyCount(protoMention.getDependingVerb().getCopyCount());
            }
            int headWordIndex = protoMention.getHeadWord().getTokenIndex();
            if (headWordIndex >= 0) {
                mentionToUpdate.headWord = sentenceTokens.get(headWordIndex);
            }
            mentionToUpdate.sentenceWords = new ArrayList<>();
            for (CoreNLPProtos.IndexedWord clp : protoMention.getSentenceWordsList()) {
                int ti = clp.getTokenIndex();
                mentionToUpdate.sentenceWords.add(sentenceTokens.get(ti));
            }
            mentionToUpdate.originalSpan = new ArrayList<>();
            for (CoreNLPProtos.IndexedWord clp : protoMention.getOriginalSpanList()) {
                int ti = clp.getTokenIndex();
                mentionToUpdate.originalSpan.add(sentenceTokens.get(ti));
            }
            if (protoMention.getHasBasicDependency()) {
                mentionToUpdate.basicDependency = map.get(BasicDependenciesAnnotation.class);
            }
            if (protoMention.getHasEnhancedDepenedncy()) { // "Depenedncy" misspelling matches the generated proto API
                mentionToUpdate.enhancedDependency = map.get(EnhancedDependenciesAnnotation.class);
            }
            if (protoMention.getHasContextParseTree()) {
                mentionToUpdate.contextParseTree = map.get(TreeAnnotation.class);
            }
            // move on to next mention
            mentionInt++;
        }
    }
    // Set quotes
    List<CoreMap> quotes = proto.getQuoteList().stream().map(quote -> fromProto(quote, tokens)).collect(Collectors.toList());
    if (!quotes.isEmpty()) {
        ann.set(QuotationsAnnotation.class, quotes);
    }
    // Set NERmention
    List<CoreMap> mentions = proto.getMentionsList().stream().map(this::fromProto).collect(Collectors.toList());
    if (!mentions.isEmpty()) {
        ann.set(MentionsAnnotation.class, mentions);
    }
    // also add all the Set<Mention>
    for (int mentionID : idToMention.keySet()) {
        Mention mentionToUpdate = idToMention.get(mentionID);
        // this is the proto Mention message corresponding to this Mention
        CoreNLPProtos.Mention correspondingProtoMention = idToProtoMention.get(mentionID);
        if (!correspondingProtoMention.hasSpeakerInfo()) {
            // no speakerInfo was stored for this Mention, so skip to the next one
            continue;
        }
        // if we're here we know a speakerInfo was stored
        SpeakerInfo speakerInfo = fromProto(correspondingProtoMention.getSpeakerInfo());
        // MentionID is ID in document, 0, 1, 2, etc...
        for (int speakerInfoMentionID : correspondingProtoMention.getSpeakerInfo().getMentionsList()) {
            speakerInfo.addMention(idToMention.get(speakerInfoMentionID));
        }
        // now the SpeakerInfo for this Mention should be fully restored
        mentionToUpdate.speakerInfo = speakerInfo;
    }
    // Return
    return ann;
}
Also used : ExtractionObject(edu.stanford.nlp.ie.machinereading.structure.ExtractionObject) java.util(java.util) CorefChain(edu.stanford.nlp.coref.data.CorefChain) edu.stanford.nlp.util(edu.stanford.nlp.util) Tree(edu.stanford.nlp.trees.Tree) Dictionaries(edu.stanford.nlp.coref.data.Dictionaries) MachineReadingAnnotations(edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations) TimeAnnotations(edu.stanford.nlp.time.TimeAnnotations) RelationMention(edu.stanford.nlp.ie.machinereading.structure.RelationMention) Mention(edu.stanford.nlp.coref.data.Mention) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) Language(edu.stanford.nlp.international.Language) RNNCoreAnnotations(edu.stanford.nlp.neural.rnn.RNNCoreAnnotations) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) LabeledScoredTreeNode(edu.stanford.nlp.trees.LabeledScoredTreeNode) Timex(edu.stanford.nlp.time.Timex) IndexedWord(edu.stanford.nlp.ling.IndexedWord) TreeCoreAnnotations(edu.stanford.nlp.trees.TreeCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) GrammaticalRelation(edu.stanford.nlp.trees.GrammaticalRelation) edu.stanford.nlp.naturalli(edu.stanford.nlp.naturalli) SentimentCoreAnnotations(edu.stanford.nlp.sentiment.SentimentCoreAnnotations) NumberNormalizer(edu.stanford.nlp.ie.NumberNormalizer) Collectors(java.util.stream.Collectors) EntityMention(edu.stanford.nlp.ie.machinereading.structure.EntityMention) SegmenterCoreAnnotations(edu.stanford.nlp.ling.SegmenterCoreAnnotations) SpeakerInfo(edu.stanford.nlp.coref.data.SpeakerInfo) Span(edu.stanford.nlp.ie.machinereading.structure.Span) Word(edu.stanford.nlp.ling.Word) java.io(java.io) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations)
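A round-trip sketch connecting Examples 36 and 37 (not from the CoreNLP source; the annotator list, input text, and file name are illustrative assumptions). The serializer's write(...) and read(...) wrap toProto(Annotation) and fromProto(CoreNLPProtos.Document) respectively.

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.Pair;

import java.io.*;
import java.util.Properties;

public class ProtoRoundTrip {

    public static void main(String[] args) throws IOException, ClassNotFoundException {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse,coref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation original = new Annotation("Barack Obama was born in Hawaii. He was elected in 2008.");
        pipeline.annotate(original);

        ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
        // serializes via toProto(Annotation)
        try (OutputStream os = new FileOutputStream("doc.pb")) {
            serializer.write(original, os);
        }
        // deserializes via fromProto(CoreNLPProtos.Document)
        try (InputStream is = new FileInputStream("doc.pb")) {
            Pair<Annotation, InputStream> result = serializer.read(is);
            Annotation restored = result.first;
            System.out.println(restored.get(CoreAnnotations.TokensAnnotation.class).size());
        }
    }
}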

Example 38 with Mention

Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.

The class NeuralCorefDataExporter, method process.

@Override
public void process(int id, Document document) {
    JsonArrayBuilder clusters = Json.createArrayBuilder();
    for (CorefCluster gold : document.goldCorefClusters.values()) {
        JsonArrayBuilder c = Json.createArrayBuilder();
        for (Mention m : gold.corefMentions) {
            c.add(m.mentionID);
        }
        clusters.add(c.build());
    }
    goldClusterWriter.println(Json.createObjectBuilder().add(String.valueOf(id), clusters.build()).build());
    Map<Pair<Integer, Integer>, Boolean> mentionPairs = CorefUtils.getLabeledMentionPairs(document);
    List<Mention> mentionsList = CorefUtils.getSortedMentions(document);
    Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
    for (Mention m : mentionsList) {
        List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
        if (withIndex == null) {
            withIndex = new ArrayList<>();
            mentionsByHeadIndex.put(m.headIndex, withIndex);
        }
        withIndex.add(m);
    }
    JsonObjectBuilder docFeatures = Json.createObjectBuilder();
    docFeatures.add("doc_id", id);
    docFeatures.add("type", document.docType == DocType.ARTICLE ? 1 : 0);
    docFeatures.add("source", document.docInfo.get("DOC_ID").split("/")[0]);
    JsonArrayBuilder sentences = Json.createArrayBuilder();
    for (CoreMap sentence : document.annotation.get(SentencesAnnotation.class)) {
        sentences.add(getSentenceArray(sentence.get(CoreAnnotations.TokensAnnotation.class)));
    }
    JsonObjectBuilder mentions = Json.createObjectBuilder();
    for (Mention m : document.predictedMentionsByID.values()) {
        Iterator<SemanticGraphEdge> iterator = m.enhancedDependency.incomingEdgeIterator(m.headIndexedWord);
        SemanticGraphEdge relation = iterator.hasNext() ? iterator.next() : null;
        String depRelation = relation == null ? "no-parent" : relation.getRelation().toString();
        String depParent = relation == null ? "<missing>" : relation.getSource().word();
        mentions.add(String.valueOf(m.mentionNum), Json.createObjectBuilder()
                .add("doc_id", id)
                .add("mention_id", m.mentionID)
                .add("mention_num", m.mentionNum)
                .add("sent_num", m.sentNum)
                .add("start_index", m.startIndex)
                .add("end_index", m.endIndex)
                .add("head_index", m.headIndex)
                .add("mention_type", m.mentionType.toString())
                .add("dep_relation", depRelation)
                .add("dep_parent", depParent)
                .add("sentence", getSentenceArray(m.sentenceWords))
                .add("contained-in-other-mention", mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)) ? 1 : 0)
                .build());
    }
    JsonArrayBuilder featureNames = Json.createArrayBuilder().add("same-speaker").add("antecedent-is-mention-speaker").add("mention-is-antecedent-speaker").add("relaxed-head-match").add("exact-string-match").add("relaxed-string-match");
    JsonObjectBuilder features = Json.createObjectBuilder();
    JsonObjectBuilder labels = Json.createObjectBuilder();
    for (Map.Entry<Pair<Integer, Integer>, Boolean> e : mentionPairs.entrySet()) {
        Mention m1 = document.predictedMentionsByID.get(e.getKey().first);
        Mention m2 = document.predictedMentionsByID.get(e.getKey().second);
        String key = m1.mentionNum + " " + m2.mentionNum;
        JsonArrayBuilder builder = Json.createArrayBuilder();
        for (int val : CategoricalFeatureExtractor.pairwiseFeatures(document, m1, m2, dictionaries, conll)) {
            builder.add(val);
        }
        features.add(key, builder.build());
        labels.add(key, e.getValue() ? 1 : 0);
    }
    JsonObject docData = Json.createObjectBuilder()
            .add("sentences", sentences.build())
            .add("mentions", mentions.build())
            .add("labels", labels.build())
            .add("pair_feature_names", featureNames.build())
            .add("pair_features", features.build())
            .add("document_features", docFeatures.build())
            .build();
    dataWriter.println(docData);
}
Also used : HashMap(java.util.HashMap) JsonObject(javax.json.JsonObject) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) ArrayList(java.util.ArrayList) List(java.util.List) JsonArrayBuilder(javax.json.JsonArrayBuilder) JsonObjectBuilder(javax.json.JsonObjectBuilder) CoreMap(edu.stanford.nlp.util.CoreMap) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair)
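The get-then-put grouping of mentions by head index above (it recurs in Example 39 below) can be written more compactly with Map.computeIfAbsent; a behavior-equivalent sketch:

import edu.stanford.nlp.coref.data.Mention;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupByHead {

    static Map<Integer, List<Mention>> byHeadIndex(List<Mention> mentionsList) {
        Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
        for (Mention m : mentionsList) {
            // create the bucket on first sight of this head index, then append
            mentionsByHeadIndex.computeIfAbsent(m.headIndex, k -> new ArrayList<>()).add(m);
        }
        return mentionsByHeadIndex;
    }
}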

Example 39 with Mention

Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.

The class FeatureExtractor, method extract.

public DocumentExamples extract(int id, Document document, Map<Pair<Integer, Integer>, Boolean> labeledPairs, Compressor<String> compressor) {
    List<Mention> mentionsList = CorefUtils.getSortedMentions(document);
    Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
    for (Mention m : mentionsList) {
        List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
        if (withIndex == null) {
            withIndex = new ArrayList<>();
            mentionsByHeadIndex.put(m.headIndex, withIndex);
        }
        withIndex.add(m);
    }
    Map<Integer, Mention> mentions = document.predictedMentionsByID;
    List<Example> examples = new ArrayList<>();
    Set<Integer> mentionsToExtract = new HashSet<>();
    for (Map.Entry<Pair<Integer, Integer>, Boolean> pair : labeledPairs.entrySet()) {
        Mention m1 = mentions.get(pair.getKey().first);
        Mention m2 = mentions.get(pair.getKey().second);
        mentionsToExtract.add(m1.mentionID);
        mentionsToExtract.add(m2.mentionID);
        CompressedFeatureVector features = compressor.compress(getFeatures(document, m1, m2));
        examples.add(new Example(id, m1, m2, pair.getValue() ? 1.0 : 0.0, features));
    }
    Map<Integer, CompressedFeatureVector> mentionFeatures = new HashMap<>();
    for (int mentionID : mentionsToExtract) {
        mentionFeatures.put(mentionID, compressor.compress(getFeatures(document, document.predictedMentionsByID.get(mentionID), mentionsByHeadIndex)));
    }
    return new DocumentExamples(id, examples, mentionFeatures);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Mention(edu.stanford.nlp.coref.data.Mention) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet) Pair(edu.stanford.nlp.util.Pair)
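For orientation, the labeledPairs argument maps an ordered pair of mention IDs to a coreference label. A toy construction (the IDs here are hypothetical, not from the source):

import edu.stanford.nlp.util.Pair;
import java.util.HashMap;
import java.util.Map;

public class ToyLabeledPairs {

    static Map<Pair<Integer, Integer>, Boolean> make() {
        Map<Pair<Integer, Integer>, Boolean> labeledPairs = new HashMap<>();
        // mentions 3 and 7 corefer; mentions 3 and 9 do not
        labeledPairs.put(Pair.makePair(3, 7), true);
        labeledPairs.put(Pair.makePair(3, 9), false);
        return labeledPairs;
    }
}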

Example 40 with Mention

Use of edu.stanford.nlp.coref.data.Mention in project CoreNLP by stanfordnlp.

The class MetadataWriter, method process.

@Override
public void process(int id, Document document) {
    // Mention types
    mentionTypes.put(id, document.predictedMentionsByID.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString())));
    // Gold clusters
    List<List<Integer>> clusters = new ArrayList<>();
    for (CorefCluster c : document.goldCorefClusters.values()) {
        List<Integer> cluster = new ArrayList<>();
        for (Mention m : c.getCorefMentions()) {
            cluster.add(m.mentionID);
        }
        clusters.add(cluster);
    }
    goldClusters.put(id, clusters);
    // Word counting
    if (countWords && mentionPairs.containsKey(id)) {
        Set<Pair<Integer, Integer>> pairs = mentionPairs.get(id).keySet();
        Set<Integer> mentions = new HashSet<>();
        for (Pair<Integer, Integer> pair : pairs) {
            mentions.add(pair.first);
            mentions.add(pair.second);
            Mention m1 = document.predictedMentionsByID.get(pair.first);
            Mention m2 = document.predictedMentionsByID.get(pair.second);
            wordCounts.incrementCount("h_" + m1.headWord.word().toLowerCase() + "_" + m2.headWord.word().toLowerCase());
        }
        Map<Integer, List<CoreLabel>> sentences = new HashMap<>();
        for (int mention : mentions) {
            Mention m = document.predictedMentionsByID.get(mention);
            if (!sentences.containsKey(m.sentNum)) {
                sentences.put(m.sentNum, m.sentenceWords);
            }
        }
        for (List<CoreLabel> sentence : sentences.values()) {
            for (int i = 0; i < sentence.size(); i++) {
                CoreLabel cl = sentence.get(i);
                if (cl == null) {
                    continue;
                }
                String w = cl.word().toLowerCase();
                wordCounts.incrementCount(w);
                if (i > 0) {
                    CoreLabel clp = sentence.get(i - 1);
                    if (clp == null) {
                        continue;
                    }
                    String wp = clp.word().toLowerCase();
                    wordCounts.incrementCount(wp + "_" + w);
                }
            }
        }
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CorefCluster(edu.stanford.nlp.coref.data.CorefCluster) Mention(edu.stanford.nlp.coref.data.Mention) List(java.util.List) Pair(edu.stanford.nlp.util.Pair) HashSet(java.util.HashSet)
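The type of wordCounts is not shown in this snippet; the Aggregations list below suggests it is a ClassicCounter<String>. Under that assumption, a standalone sketch of the unigram-plus-bigram counting idiom used above:

import edu.stanford.nlp.stats.ClassicCounter;
import java.util.Arrays;
import java.util.List;

public class BigramCounts {

    public static void main(String[] args) {
        ClassicCounter<String> wordCounts = new ClassicCounter<>();
        List<String> sentence = Arrays.asList("The", "cat", "sat");
        for (int i = 0; i < sentence.size(); i++) {
            String w = sentence.get(i).toLowerCase();
            // unigram count
            wordCounts.incrementCount(w);
            if (i > 0) {
                String wp = sentence.get(i - 1).toLowerCase();
                // bigram count, joined with "_" as in the method above
                wordCounts.incrementCount(wp + "_" + w);
            }
        }
        // prints 1.0
        System.out.println(wordCounts.getCount("cat_sat"));
    }
}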

Aggregations

Mention (edu.stanford.nlp.coref.data.Mention): 62
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 27
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 27
SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 21
ArrayList (java.util.ArrayList): 20
TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 17
CoreMap (edu.stanford.nlp.util.CoreMap): 17
List (java.util.List): 15
Tree (edu.stanford.nlp.trees.Tree): 14
IntPair (edu.stanford.nlp.util.IntPair): 14
CorefCluster (edu.stanford.nlp.coref.data.CorefCluster): 12
SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 10
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 9
EntityMention (edu.stanford.nlp.ie.machinereading.structure.EntityMention): 7
RelationMention (edu.stanford.nlp.ie.machinereading.structure.RelationMention): 7
ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 7
HashMap (java.util.HashMap): 7
HashSet (java.util.HashSet): 7
SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge): 6
Map (java.util.Map): 6