Search in sources:

Example 1 with Document

Use of edu.stanford.nlp.simple.Document in project CoreNLP by stanfordnlp.

Source: class KBPAnnotator, method annotate.
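
For context: edu.stanford.nlp.simple.Document is the entry point of CoreNLP's "simple" API. In the annotate method below it is constructed from kbpProperties and a protobuf serialization of the full Annotation, but it can also be built straight from raw text. A minimal sketch under that assumption follows; the demo class name and sample sentence are made up and not part of CoreNLP:

import edu.stanford.nlp.simple.Document;
import edu.stanford.nlp.simple.Sentence;

// Hypothetical demo class, not part of CoreNLP.
public class SimpleDocumentSketch {
    public static void main(String[] args) {
        // Build a simple-API Document directly from raw text (illustrative sample text).
        Document doc = new Document("Joe Smith works for the ACLU. He lives in Oregon.");
        // The simple API tokenizes and annotates sentences lazily, on first access.
        for (Sentence sentence : doc.sentences()) {
            System.out.println(sentence.words());
        }
    }
}
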

/**
 * Annotate this document for KBP relations.
 * @param annotation The document to annotate.
 */
@Override
public void annotate(Annotation annotation) {
    // get a list of sentences for this annotation
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    // Create simple document
    Document doc = new Document(kbpProperties, serializer.toProto(annotation));
    // Get the mentions in the document
    List<CoreMap> mentions = new ArrayList<>();
    for (CoreMap sentence : sentences) {
        mentions.addAll(sentence.get(CoreAnnotations.MentionsAnnotation.class));
    }
    // Compute coreferent clusters
    // (map an index to a KBP mention)
    Map<Pair<Integer, Integer>, CoreMap> mentionByStartIndex = new HashMap<>();
    for (CoreMap mention : mentions) {
        for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
            mentionByStartIndex.put(Pair.makePair(token.sentIndex(), token.index()), mention);
        }
    }
    // (collect coreferent KBP mentions)
    // map from canonical mention -> other mentions
    Map<CoreMap, Set<CoreMap>> mentionsMap = new HashMap<>();
    if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
        for (Map.Entry<Integer, CorefChain> chain : annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet()) {
            CoreMap firstMention = null;
            for (CorefChain.CorefMention mention : chain.getValue().getMentionsInTextualOrder()) {
                CoreMap kbpMention = null;
                for (int i = mention.startIndex; i < mention.endIndex; ++i) {
                    if (mentionByStartIndex.containsKey(Pair.makePair(mention.sentNum - 1, i))) {
                        kbpMention = mentionByStartIndex.get(Pair.makePair(mention.sentNum - 1, i));
                        break;
                    }
                }
                if (firstMention == null) {
                    firstMention = kbpMention;
                }
                if (kbpMention != null) {
                    if (!mentionsMap.containsKey(firstMention)) {
                        mentionsMap.put(firstMention, new LinkedHashSet<>());
                    }
                    mentionsMap.get(firstMention).add(kbpMention);
                }
            }
        }
    }
    // (coreference acronyms)
    acronymMatch(mentions, mentionsMap);
    // (ensure valid NER tag for canonical mention)
    for (CoreMap key : new HashSet<>(mentionsMap.keySet())) {
        if (key.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null) {
            CoreMap newKey = null;
            for (CoreMap candidate : mentionsMap.get(key)) {
                if (candidate.get(CoreAnnotations.NamedEntityTagAnnotation.class) != null) {
                    newKey = candidate;
                    break;
                }
            }
            if (newKey != null) {
                mentionsMap.put(newKey, mentionsMap.remove(key));
            } else {
                // case: no mention in this chain has an NER tag.
                mentionsMap.remove(key);
            }
        }
    }
    // Propagate Entity Link
    for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
        String entityLink = entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class);
        if (entityLink != null) {
            for (CoreMap mention : entry.getValue()) {
                for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
                    token.set(CoreAnnotations.WikipediaEntityAnnotation.class, entityLink);
                }
            }
        }
    }
    // create a mapping of char offset pairs to KBPMention
    HashMap<Pair<Integer, Integer>, CoreMap> charOffsetToKBPMention = new HashMap<>();
    for (CoreMap mention : mentions) {
        int nerMentionCharBegin = mention.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int nerMentionCharEnd = mention.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        charOffsetToKBPMention.put(new Pair<>(nerMentionCharBegin, nerMentionCharEnd), mention);
    }
    // Create a canonical mention map
    Map<CoreMap, CoreMap> mentionToCanonicalMention;
    if (kbpLanguage.equals(LanguageInfo.HumanLanguage.SPANISH)) {
        mentionToCanonicalMention = spanishCorefSystem.canonicalMentionMapFromEntityMentions(mentions);
        if (VERBOSE) {
            log.info("---");
            log.info("basic spanish coref results");
            for (CoreMap originalMention : mentionToCanonicalMention.keySet()) {
                if (!originalMention.equals(mentionToCanonicalMention.get(originalMention))) {
                    log.info("mapped: " + originalMention + " to: " + mentionToCanonicalMention.get(originalMention));
                }
            }
        }
    } else {
        mentionToCanonicalMention = new HashMap<>();
    }
    // check if there is coref info
    Set<Map.Entry<Integer, CorefChain>> corefChains;
    if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null && !kbpLanguage.equals(LanguageInfo.HumanLanguage.SPANISH))
        corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet();
    else
        corefChains = new HashSet<>();
    for (Map.Entry<Integer, CorefChain> indexCorefChainPair : corefChains) {
        CorefChain corefChain = indexCorefChainPair.getValue();
        Pair<List<CoreMap>, CoreMap> corefChainKBPMentionsAndBestIndex = corefChainToKBPMentions(corefChain, annotation, charOffsetToKBPMention);
        List<CoreMap> corefChainKBPMentions = corefChainKBPMentionsAndBestIndex.first();
        CoreMap bestKBPMentionForChain = corefChainKBPMentionsAndBestIndex.second();
        if (bestKBPMentionForChain != null) {
            for (CoreMap kbpMention : corefChainKBPMentions) {
                if (kbpMention != null) {
                    // System.err.println("---");
                    // ad hoc filters ; assume acceptable unless a filter blocks it
                    boolean acceptableLink = true;
                    // block people matches without a token overlap, exempting pronominal to non-pronominal
                    // good: Ashton --> Catherine Ashton
                    // good: she --> Catherine Ashton
                    // bad: Morsi --> Catherine Ashton
                    String kbpMentionNERTag = kbpMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                    String bestKBPMentionForChainNERTag = bestKBPMentionForChain.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                    if (kbpMentionNERTag != null && bestKBPMentionForChainNERTag != null && kbpMentionNERTag.equals("PERSON") && bestKBPMentionForChainNERTag.equals("PERSON") && !kbpIsPronominalMention(kbpMention.get(CoreAnnotations.TokensAnnotation.class).get(0)) && !kbpIsPronominalMention(bestKBPMentionForChain.get(CoreAnnotations.TokensAnnotation.class).get(0))) {
                        // System.err.println("testing PERSON to PERSON coref link");
                        boolean tokenMatchFound = false;
                        for (CoreLabel kbpToken : kbpMention.get(CoreAnnotations.TokensAnnotation.class)) {
                            for (CoreLabel bestKBPToken : bestKBPMentionForChain.get(CoreAnnotations.TokensAnnotation.class)) {
                                if (kbpToken.word().toLowerCase().equals(bestKBPToken.word().toLowerCase())) {
                                    tokenMatchFound = true;
                                    break;
                                }
                            }
                            if (tokenMatchFound)
                                break;
                        }
                        if (!tokenMatchFound)
                            acceptableLink = false;
                    }
                    // check the coref link passed the filters
                    if (acceptableLink)
                        mentionToCanonicalMention.put(kbpMention, bestKBPMentionForChain);
                // System.err.println("kbp mention: " + kbpMention.get(CoreAnnotations.TextAnnotation.class));
                // System.err.println("coref mention: " + bestKBPMentionForChain.get(CoreAnnotations.TextAnnotation.class));
                }
            }
        }
    }
    // (add missing mentions)
    mentions.stream().filter(mention -> mentionToCanonicalMention.get(mention) == null).forEach(mention -> mentionToCanonicalMention.put(mention, mention));
    // handle acronym coreference
    HashMap<String, List<CoreMap>> acronymClusters = new HashMap<>();
    HashMap<String, List<CoreMap>> acronymInstances = new HashMap<>();
    for (CoreMap acronymMention : mentionToCanonicalMention.keySet()) {
        String acronymNERTag = acronymMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        if ((acronymMention == mentionToCanonicalMention.get(acronymMention)) && acronymNERTag != null && (acronymNERTag.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name) || acronymNERTag.equals(KBPRelationExtractor.NERTag.LOCATION.name))) {
            String acronymText = acronymMention.get(CoreAnnotations.TextAnnotation.class);
            // define acronyms as not containing spaces (e.g. ACLU)
            if (!acronymText.contains(" ")) {
                int numCoreferentsChecked = 0;
                for (CoreMap coreferentMention : mentions) {
                    // only check first 1000
                    if (numCoreferentsChecked > 1000)
                        break;
                    // don't check a mention against itself
                    if (acronymMention == coreferentMention)
                        continue;
                    // don't check other mentions without " "
                    String coreferentText = coreferentMention.get(CoreAnnotations.TextAnnotation.class);
                    if (!coreferentText.contains(" "))
                        continue;
                    numCoreferentsChecked++;
                    List<String> coreferentTokenStrings = coreferentMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(CoreLabel::word).collect(Collectors.toList());
                    // afterwards find the best mention in acronymClusters, and match it to every mention in acronymInstances
                    if (AcronymMatcher.isAcronym(acronymText, coreferentTokenStrings)) {
                        if (!acronymClusters.containsKey(acronymText))
                            acronymClusters.put(acronymText, new ArrayList<>());
                        if (!acronymInstances.containsKey(acronymText))
                            acronymInstances.put(acronymText, new ArrayList<>());
                        acronymClusters.get(acronymText).add(coreferentMention);
                        acronymInstances.get(acronymText).add(acronymMention);
                    }
                }
            }
        }
    }
    // process each acronym (e.g. ACLU)
    for (String acronymText : acronymInstances.keySet()) {
        // find longest ORG or null
        CoreMap bestORG = null;
        for (CoreMap coreferentMention : acronymClusters.get(acronymText)) {
            if (!coreferentMention.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(KBPRelationExtractor.NERTag.ORGANIZATION.name))
                continue;
            if (bestORG == null)
                bestORG = coreferentMention;
            else if (coreferentMention.get(CoreAnnotations.TextAnnotation.class).length() > bestORG.get(CoreAnnotations.TextAnnotation.class).length())
                bestORG = coreferentMention;
        }
        // find longest LOC or null
        CoreMap bestLOC = null;
        for (CoreMap coreferentMention : acronymClusters.get(acronymText)) {
            if (!coreferentMention.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(KBPRelationExtractor.NERTag.LOCATION.name))
                continue;
            if (bestLOC == null)
                bestLOC = coreferentMention;
            else if (coreferentMention.get(CoreAnnotations.TextAnnotation.class).length() > bestLOC.get(CoreAnnotations.TextAnnotation.class).length())
                bestLOC = coreferentMention;
        }
        // link ACLU to "American Civil Liberties Union" ; make sure NER types match
        for (CoreMap acronymMention : acronymInstances.get(acronymText)) {
            String mentionType = acronymMention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            if (mentionType.equals(KBPRelationExtractor.NERTag.ORGANIZATION.name) && bestORG != null)
                mentionToCanonicalMention.put(acronymMention, bestORG);
            if (mentionType.equals(KBPRelationExtractor.NERTag.LOCATION.name) && bestLOC != null)
                mentionToCanonicalMention.put(acronymMention, bestLOC);
        }
    }
    // Cluster mentions by sentence
    @SuppressWarnings("unchecked") List<CoreMap>[] mentionsBySentence = new List[annotation.get(CoreAnnotations.SentencesAnnotation.class).size()];
    for (int i = 0; i < mentionsBySentence.length; ++i) {
        mentionsBySentence[i] = new ArrayList<>();
    }
    for (CoreMap mention : mentionToCanonicalMention.keySet()) {
        mentionsBySentence[mention.get(CoreAnnotations.SentenceIndexAnnotation.class)].add(mention);
    }
    // Classify
    for (int sentenceI = 0; sentenceI < mentionsBySentence.length; ++sentenceI) {
        HashMap<String, RelationTriple> relationStringsToTriples = new HashMap<>();
        // the annotations
        List<RelationTriple> finalTriplesList = new ArrayList<>();
        List<CoreMap> candidates = mentionsBySentence[sentenceI];
        // determine sentence length
        int sentenceLength = annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).get(CoreAnnotations.TokensAnnotation.class).size();
        // check if the sentence is too long; if so, skip KBP for this sentence
        if (maxLength != -1 && sentenceLength > maxLength) {
            // set the triples annotation to an empty list of RelationTriples
            annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, finalTriplesList);
            // continue to next sentence
            continue;
        }
        // sentence isn't too long, so continue processing this sentence
        for (int subjI = 0; subjI < candidates.size(); ++subjI) {
            CoreMap subj = candidates.get(subjI);
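            // token.index() is 1-based within the sentence, so subtracting 1 gives a 0-based,
            // end-exclusive token span; the object span below uses the same convention.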
            int subjBegin = subj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
            int subjEnd = subj.get(CoreAnnotations.TokensAnnotation.class).get(subj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
            Optional<KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.fromString(subj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
            if (subjNER.isPresent()) {
                for (int objI = 0; objI < candidates.size(); ++objI) {
                    if (subjI == objI) {
                        continue;
                    }
                    if (Thread.interrupted()) {
                        throw new RuntimeInterruptedException();
                    }
                    CoreMap obj = candidates.get(objI);
                    int objBegin = obj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
                    int objEnd = obj.get(CoreAnnotations.TokensAnnotation.class).get(obj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
                    Optional<KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.fromString(obj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
                    if (objNER.isPresent() && KBPRelationExtractor.RelationType.plausiblyHasRelation(subjNER.get(), objNER.get())) {
                        // type check
                        KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.get(), objNER.get(), doc.sentence(sentenceI));
                        // -- BEGIN Classify
                        Pair<String, Double> prediction = extractor.classify(input);
                        // Handle the classifier output
                        if (!KBPStatisticalExtractor.NO_RELATION.equals(prediction.first)) {
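                            // Build a linked triple: subject tokens, canonical subject tokens, a single pseudo-token
                            // carrying the relation name, object tokens, canonical object tokens, the classifier
                            // confidence, the sentence's dependency graph, and the subject/object Wikipedia links.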
                            RelationTriple triple = new RelationTriple.WithLink(subj.get(CoreAnnotations.TokensAnnotation.class), mentionToCanonicalMention.get(subj).get(CoreAnnotations.TokensAnnotation.class), Collections.singletonList(new CoreLabel(new Word(convertRelationNameToLatest(prediction.first)))), obj.get(CoreAnnotations.TokensAnnotation.class), mentionToCanonicalMention.get(obj).get(CoreAnnotations.TokensAnnotation.class), prediction.second, sentences.get(sentenceI).get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class), subj.get(CoreAnnotations.WikipediaEntityAnnotation.class), obj.get(CoreAnnotations.WikipediaEntityAnnotation.class));
                            String tripleString = triple.subjectGloss() + "\t" + triple.relationGloss() + "\t" + triple.objectGloss();
                            // ad hoc checks for problems
                            boolean acceptableTriple = true;
                            if (triple.objectGloss().equals(triple.subjectGloss()) && triple.relationGloss().endsWith("alternate_names"))
                                acceptableTriple = false;
                            // duplicate triple strings can arise with different confidence scores, so keep only the highest-confidence version
                            if (acceptableTriple && !relationStringsToTriples.containsKey(tripleString))
                                relationStringsToTriples.put(tripleString, triple);
                            else if (acceptableTriple && triple.confidence > relationStringsToTriples.get(tripleString).confidence)
                                relationStringsToTriples.put(tripleString, triple);
                        }
                    }
                }
            }
        }
        finalTriplesList = new ArrayList<>(relationStringsToTriples.values());
        // Set triples
        annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, finalTriplesList);
    }
}
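
The annotator above is normally not invoked directly; it is reached through a StanfordCoreNLP pipeline that supplies the tokens, NER mentions, parses, and coref chains it consumes, and its output is read back through CoreAnnotations.KBPTriplesAnnotation (set per sentence at the end of annotate above). The sketch below shows that round trip; the annotator list is an assumption that may need adjusting for a given CoreNLP version, and the demo class name and sample sentence are illustrative only.

import java.util.Properties;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

// Hypothetical demo class, not part of CoreNLP.
public class KBPUsageSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Assumed annotator chain: kbp needs tokens, NER mentions, a parse, and coref upstream.
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref,kbp");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation("Joe Smith was born in Oregon.");
        pipeline.annotate(annotation);
        // Read the triples that KBPAnnotator.annotate attached to each sentence.
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (RelationTriple triple : sentence.get(CoreAnnotations.KBPTriplesAnnotation.class)) {
                System.out.println(triple.subjectGloss() + "\t" + triple.relationGloss()
                        + "\t" + triple.objectGloss() + "\t" + triple.confidence);
            }
        }
    }
}
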
Also used: CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CorefChain(edu.stanford.nlp.coref.data.CorefChain) IOUtils(edu.stanford.nlp.io.IOUtils) edu.stanford.nlp.util(edu.stanford.nlp.util) Redwood(edu.stanford.nlp.util.logging.Redwood) IOException(java.io.IOException) Document(edu.stanford.nlp.simple.Document) Collectors(java.util.stream.Collectors) LinearClassifier(edu.stanford.nlp.classify.LinearClassifier) Classifier(edu.stanford.nlp.classify.Classifier) Span(edu.stanford.nlp.ie.machinereading.structure.Span) edu.stanford.nlp.ie(edu.stanford.nlp.ie) Word(edu.stanford.nlp.ling.Word) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) WordLists(edu.stanford.nlp.coref.data.WordLists) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) CorefCoreAnnotations(edu.stanford.nlp.coref.CorefCoreAnnotations) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)
