Example 11 with RelationTriple

Use of edu.stanford.nlp.ie.util.RelationTriple in project CoreNLP by stanfordnlp.

From the class ClauseSplitter, method train.

/**
   * Train a clause searcher factory. That is, train a classifier that decides which
   * dependency arcs should be split off into new clauses.
   *
   * @param trainingData The training data: a stream pairing each sentence (containing known
   *                     extractions) with the extractions' gold spans, each given as a
   *                     (subject span, object span) pair of token spans in that sentence.
   * @param modelPath The path to save the model to. This is useful for {@link ClauseSplitter#load(String)}.
   * @param trainingDataDump The path to save the training data, as a set of labeled featurized datums.
   * @param featurizer The featurizer to use for this classifier.
   *
   * @return A factory for creating searchers from a given dependency tree.
   */
static ClauseSplitter train(Stream<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData, Optional<File> modelPath, Optional<File> trainingDataDump, Featurizer featurizer) {
    // Create the classifier factory
    LinearClassifierFactory<ClauseClassifierLabel, String> factory = new LinearClassifierFactory<>();
    // Generally useful objects
    OpenIE openie = new OpenIE(PropertiesUtils.asProperties("splitter.nomodel", "true", "optimizefor", "GENERAL"));
    WeightedDataset<ClauseClassifierLabel, String> dataset = new WeightedDataset<>();
    AtomicInteger numExamplesProcessed = new AtomicInteger(0);
    final Optional<PrintWriter> datasetDumpWriter = trainingDataDump.map(file -> {
        try {
            return new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file))));
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    });
    // Step 1: Loop over data
    forceTrack("Training inference");
    trainingData.forEach(rawExample -> {
        CoreMap sentence = rawExample.first;
        Collection<Pair<Span, Span>> spans = rawExample.second;
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        ClauseSplitterSearchProblem problem = new ClauseSplitterSearchProblem(tree, true);
        problem.search(fragmentAndScore -> {
            List<Counter<String>> features = fragmentAndScore.second;
            SentenceFragment fragment = fragmentAndScore.third.get();
            Set<RelationTriple> extractions = new HashSet<>(openie.relationsInFragments(openie.entailmentsFromClause(fragment)));
            Trilean correct = Trilean.FALSE;
            RELATION_TRIPLE_LOOP: for (RelationTriple extraction : extractions) {
                Span subjectGuess = Span.fromValues(extraction.subject.get(0).index() - 1, extraction.subject.get(extraction.subject.size() - 1).index());
                Span objectGuess = Span.fromValues(extraction.object.get(0).index() - 1, extraction.object.get(extraction.object.size() - 1).index());
                for (Pair<Span, Span> candidateGold : spans) {
                    Span subjectSpan = candidateGold.first;
                    Span objectSpan = candidateGold.second;
                    if ((subjectGuess.equals(subjectSpan) && objectGuess.equals(objectSpan)) || (subjectGuess.equals(objectSpan) && objectGuess.equals(subjectSpan))) {
                        correct = Trilean.TRUE;
                        break RELATION_TRIPLE_LOOP;
                    } else if (Util.nerOverlap(tokens, subjectSpan, subjectGuess) && Util.nerOverlap(tokens, objectSpan, objectGuess) || Util.nerOverlap(tokens, subjectSpan, objectGuess) && Util.nerOverlap(tokens, objectSpan, subjectGuess)) {
                        if (!correct.isTrue()) {
                            correct = Trilean.TRUE;
                            break RELATION_TRIPLE_LOOP;
                        }
                    } else {
                        if (!correct.isTrue()) {
                            correct = Trilean.UNKNOWN;
                            break RELATION_TRIPLE_LOOP;
                        }
                    }
                }
            }
            if (!features.isEmpty()) {
                List<Pair<Counter<String>, ClauseClassifierLabel>> decisionsToAddAsDatums = new ArrayList<>();
                if (correct.isTrue()) {
                    for (int i = 0; i < features.size(); ++i) {
                        if (i == features.size() - 1) {
                            decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
                        } else {
                            decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
                        }
                    }
                } else if (correct.isFalse()) {
                    decisionsToAddAsDatums.add(Pair.makePair(features.get(features.size() - 1), ClauseClassifierLabel.NOT_A_CLAUSE));
                } else if (correct.isUnknown()) {
                    boolean isSimpleSplit = false;
                    for (Counter<String> feats : features) {
                        if (featurizer.isSimpleSplit(feats)) {
                            isSimpleSplit = true;
                            break;
                        }
                    }
                    if (isSimpleSplit) {
                        for (int i = 0; i < features.size(); ++i) {
                            if (i == features.size() - 1) {
                                decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_SPLIT));
                            } else {
                                decisionsToAddAsDatums.add(Pair.makePair(features.get(i), ClauseClassifierLabel.CLAUSE_INTERM));
                            }
                        }
                    }
                }
                for (Pair<Counter<String>, ClauseClassifierLabel> decision : decisionsToAddAsDatums) {
                    RVFDatum<ClauseClassifierLabel, String> datum = new RVFDatum<>(decision.first);
                    datum.setLabel(decision.second);
                    if (datasetDumpWriter.isPresent()) {
                        datasetDumpWriter.get().println(decision.second + "\t" + StringUtils.join(decision.first.entrySet().stream().map(entry -> entry.getKey() + "->" + entry.getValue()), ";"));
                    }
                    dataset.add(datum);
                }
            }
            return true;
        }, new LinearClassifier<>(new ClassicCounter<>()), Collections.emptyMap(), featurizer, 10000);
        if (numExamplesProcessed.incrementAndGet() % 100 == 0) {
            log("processed " + numExamplesProcessed + " training sentences: " + dataset.size() + " datums");
        }
    });
    endTrack("Training inference");
    // Close the file
    if (datasetDumpWriter.isPresent()) {
        datasetDumpWriter.get().close();
    }
    // Step 2: Train classifier
    forceTrack("Training");
    Classifier<ClauseClassifierLabel, String> fullClassifier = factory.trainClassifier(dataset);
    endTrack("Training");
    if (modelPath.isPresent()) {
        Pair<Classifier<ClauseClassifierLabel, String>, Featurizer> toSave = Pair.makePair(fullClassifier, featurizer);
        try {
            IOUtils.writeObjectToFile(toSave, modelPath.get());
            log("SUCCESS: wrote model to " + modelPath.get().getPath());
        } catch (IOException e) {
            log("ERROR: failed to save model to path: " + modelPath.get().getPath());
            err(e);
        }
    }
    // Step 3: Check accuracy of classifier
    forceTrack("Training accuracy");
    dataset.randomize(42L);
    Util.dumpAccuracy(fullClassifier, dataset);
    endTrack("Training accuracy");
    int numFolds = 5;
    forceTrack(numFolds + " fold cross-validation");
    for (int fold = 0; fold < numFolds; ++fold) {
        forceTrack("Fold " + (fold + 1));
        forceTrack("Training");
        Pair<GeneralDataset<ClauseClassifierLabel, String>, GeneralDataset<ClauseClassifierLabel, String>> foldData = dataset.splitOutFold(fold, numFolds);
        Classifier<ClauseClassifierLabel, String> classifier = factory.trainClassifier(foldData.first);
        endTrack("Training");
        forceTrack("Test");
        Util.dumpAccuracy(classifier, foldData.second);
        endTrack("Test");
        endTrack("Fold " + (fold + 1));
    }
    endTrack(numFolds + " fold cross-validation");
    // Step 5: return factory
    return (tree, truth) -> new ClauseSplitterSearchProblem(tree, truth, Optional.of(fullClassifier), Optional.of(featurizer));
}
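The sketch below (not part of the CoreNLP source) shows how this entry point might be called. The model path, the map of gold spans, and the prerequisite that each CoreMap sentence already carries TokensAnnotation and EnhancedDependenciesAnnotation (for example from a pipeline with tokenize, ssplit, pos, lemma and depparse) are illustrative assumptions; the Featurizer is passed in rather than guessing a concrete default, and imports mirror the "Also used" list below.

static ClauseSplitter trainFromAnnotatedSentences(Map<CoreMap, Collection<Pair<Span, Span>>> goldSpansBySentence, Featurizer featurizer) {
    // Pair each pre-annotated sentence with its gold (subject span, object span) pairs.
    Stream<Pair<CoreMap, Collection<Pair<Span, Span>>>> trainingData =
            goldSpansBySentence.entrySet().stream().map(e -> Pair.makePair(e.getKey(), e.getValue()));
    return ClauseSplitter.train(
            trainingData,
            // hypothetical output path for the serialized model
            Optional.of(new File("clauseSplitterModel.ser.gz")),
            // no featurized-datum dump
            Optional.empty(),
            featurizer);
}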
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), java.util, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), IOUtils (edu.stanford.nlp.io.IOUtils), edu.stanford.nlp.util, BiFunction (java.util.function.BiFunction), Redwood (edu.stanford.nlp.util.logging.Redwood), Util (edu.stanford.nlp.util.logging.Redwood.Util), Span (edu.stanford.nlp.ie.machinereading.structure.Span), Counter (edu.stanford.nlp.stats.Counter), Stream (java.util.stream.Stream), java.io, SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), edu.stanford.nlp.classify, RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException), SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph), GZIPOutputStream (java.util.zip.GZIPOutputStream), ClassicCounter (edu.stanford.nlp.stats.ClassicCounter), RVFDatum (edu.stanford.nlp.ling.RVFDatum), ClauseSplitterSearchProblem (edu.stanford.nlp.naturalli.ClauseSplitterSearchProblem)

Example 12 with RelationTriple

Use of edu.stanford.nlp.ie.util.RelationTriple in project CoreNLP by stanfordnlp.

From the class OpenIE, method annotateSentence.

/**
   * <p>
   *   Annotate a single sentence.
   * </p>
   * <p>
   *   This annotator will, in particular, set the {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.EntailedSentencesAnnotation}
   *   and {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.RelationTriplesAnnotation} annotations.
   * </p>
   */
@SuppressWarnings("unchecked")
public void annotateSentence(CoreMap sentence, Map<CoreLabel, List<CoreLabel>> canonicalMentionMap) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tokens.size() < 2) {
        // Short sentence. Skip annotating it.
        sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, Collections.emptyList());
        if (!stripEntailments) {
            sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, Collections.emptySet());
        }
    } else {
        // Get the dependency tree
        SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
        if (parse == null) {
            parse = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        }
        if (parse == null) {
            throw new IllegalStateException("Cannot run OpenIE without a parse tree!");
        }
        // Clean the tree
        parse = new SemanticGraph(parse);
        Util.cleanTree(parse);
        // Resolve Coreference
        SemanticGraph canonicalizedParse = parse;
        if (resolveCoref && !canonicalMentionMap.isEmpty()) {
            canonicalizedParse = canonicalizeCoref(parse, canonicalMentionMap);
        }
        // Run OpenIE
        // (clauses)
        // note: uses coref-canonicalized parse
        List<SentenceFragment> clauses = clausesInSentence(canonicalizedParse, true);
        // (entailment)
        Set<SentenceFragment> fragments = entailmentsFromClauses(clauses);
        // (segment)
        // note: uses non-coref-canonicalized parse!
        List<RelationTriple> extractions = segmenter.extract(parse, tokens);
        extractions.addAll(relationsInFragments(fragments, sentence));
        // Set the annotations
        sentence.set(NaturalLogicAnnotations.EntailedClausesAnnotation.class, new HashSet<>(clauses));
        sentence.set(NaturalLogicAnnotations.EntailedSentencesAnnotation.class, fragments);
        // uniq the extractions
        sentence.set(NaturalLogicAnnotations.RelationTriplesAnnotation.class, new ArrayList<>(new HashSet<>(extractions)));
        if (stripEntailments) {
            sentence.remove(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);
        }
    }
}
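For orientation, a small sketch (not from the source) of driving annotateSentence directly rather than through the full annotator. It assumes the sentences already carry token, dependency and natural-logic annotations (for example from a pipeline with tokenize, ssplit, pos, lemma, depparse and natlog), and it passes an empty canonical-mention map, so no coreference canonicalization is applied.

// doc: an already annotated Annotation; props: Properties with the OpenIE options (assumptions)
OpenIE openie = new OpenIE(props);
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
    // an empty map means the canonicalMentionMap is ignored (no coref canonicalization)
    openie.annotateSentence(sentence, Collections.emptyMap());
    Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
    Collection<SentenceFragment> entailments = sentence.get(NaturalLogicAnnotations.EntailedSentencesAnnotation.class);  // absent when stripEntailments is set
}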
Also used: SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations), CoreLabel (edu.stanford.nlp.ling.CoreLabel), RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations), SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)

Example 13 with RelationTriple

Use of edu.stanford.nlp.ie.util.RelationTriple in project CoreNLP by stanfordnlp.

From the class OpenIEDemo, method main.

public static void main(String[] args) throws Exception {
    // Create the Stanford CoreNLP pipeline
    Properties props = PropertiesUtils.asProperties("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // Annotate an example document.
    String text;
    if (args.length > 0) {
        text = IOUtils.slurpFile(args[0]);
    } else {
        text = "Obama was born in Hawaii. He is our president.";
    }
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    // Loop over sentences in the document
    int sentNo = 0;
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
        System.out.println("Sentence #" + ++sentNo + ": " + sentence.get(CoreAnnotations.TextAnnotation.class));
        // Print SemanticGraph
        System.out.println(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST));
        // Get the OpenIE triples for the sentence
        Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
        // Print the triples
        for (RelationTriple triple : triples) {
            System.out.println(triple.confidence + "\t" + triple.subjectLemmaGloss() + "\t" + triple.relationLemmaGloss() + "\t" + triple.objectLemmaGloss());
        }
        // Alternatively, to run only the clause splitter:
        List<SentenceFragment> clauses = new OpenIE(props).clausesInSentence(sentence);
        for (SentenceFragment clause : clauses) {
            System.out.println(clause.parseTree.toString(SemanticGraph.OutputFormat.LIST));
        }
        System.out.println();
    }
}
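The same triples can also be read through CoreNLP's "simple" API, which hides the pipeline setup; a short sketch, assuming the Sentence.openieTriples() method is available in the CoreNLP version in use.

import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.simple.Sentence;

public class SimpleOpenIEDemo {
    public static void main(String[] args) {
        Sentence sentence = new Sentence("Obama was born in Hawaii.");
        // the simple API runs the needed annotators lazily behind the scenes
        for (RelationTriple triple : sentence.openieTriples()) {
            System.out.println(triple.confidence + "\t" + triple.subjectGloss() + "\t" + triple.relationGloss() + "\t" + triple.objectGloss());
        }
    }
}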
Also used: SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations), Properties (java.util.Properties), StanfordCoreNLP (edu.stanford.nlp.pipeline.StanfordCoreNLP), Annotation (edu.stanford.nlp.pipeline.Annotation), RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), CoreMap (edu.stanford.nlp.util.CoreMap)

Example 14 with RelationTriple

Use of edu.stanford.nlp.ie.util.RelationTriple in project CoreNLP by stanfordnlp.

From the class RelationTripleSegmenter, method segmentACL.

/**
   * Same as {@link RelationTripleSegmenter#segmentVerb}, but with ACL clauses.
   * This is a bit out of the ordinary, logic-wise, so it sits in its own function.
   */
private Optional<RelationTriple> segmentACL(SemanticGraph parse, Optional<Double> confidence, boolean consumeAll) {
    IndexedWord subject = parse.getFirstRoot();
    Optional<List<IndexedWord>> subjectSpan = getValidSubjectChunk(parse, subject, Optional.of("acl"));
    if (subjectSpan.isPresent()) {
        // found a valid subject
        for (SemanticGraphEdge edgeFromSubj : parse.outgoingEdgeIterable(subject)) {
            if ("acl".equals(edgeFromSubj.getRelation().toString())) {
                // found a valid relation
                IndexedWord relation = edgeFromSubj.getDependent();
                List<IndexedWord> relationSpan = new ArrayList<>();
                relationSpan.add(relation);
                List<IndexedWord> objectSpan = new ArrayList<>();
                List<IndexedWord> ppSpan = new ArrayList<>();
                Optional<IndexedWord> pp = Optional.empty();
                // Get other arguments
                for (SemanticGraphEdge edgeFromRel : parse.outgoingEdgeIterable(relation)) {
                    String rel = edgeFromRel.getRelation().toString();
                    // Collect adverbs
                    if ("advmod".equals(rel)) {
                        Optional<List<IndexedWord>> advSpan = getValidAdverbChunk(parse, edgeFromRel.getDependent(), Optional.empty());
                        if (!advSpan.isPresent()) {
                            // bad adverb span!
                            return Optional.empty();
                        }
                        relationSpan.addAll(advSpan.get());
                    } else if (rel.endsWith("obj")) {  // Collect object
                        if (!objectSpan.isEmpty()) {
                            // duplicate objects!
                            return Optional.empty();
                        }
                        Optional<List<IndexedWord>> maybeObjSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.empty());
                        if (!maybeObjSpan.isPresent()) {
                            // bad object span!
                            return Optional.empty();
                        }
                        objectSpan.addAll(maybeObjSpan.get());
                    } else if (rel.startsWith("nmod:")) {  // Collect pp
                        if (!ppSpan.isEmpty()) {
                            // duplicate prepositional phrases!
                            return Optional.empty();
                        }
                        Optional<List<IndexedWord>> maybePPSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.of("case"));
                        if (!maybePPSpan.isPresent()) {
                            // bad object span!
                            return Optional.empty();
                        }
                        ppSpan.addAll(maybePPSpan.get());
                        // Add the actual preposition, if we can find it
                        for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(edgeFromRel.getDependent())) {
                            if ("case".equals(edge.getRelation().toString())) {
                                pp = Optional.of(edge.getDependent());
                            }
                        }
                    } else if (consumeAll) {
                        // bad edge out of the relation
                        return Optional.empty();
                    }
                }
                // (canonicalize the triple to be subject; relation; object, folding in the PP)
                if (!ppSpan.isEmpty() && !objectSpan.isEmpty()) {
                    relationSpan.addAll(objectSpan);
                    objectSpan = ppSpan;
                } else if (!ppSpan.isEmpty()) {
                    objectSpan = ppSpan;
                }
                // (last error checks -- shouldn't ever fire)
                if (!subjectSpan.isPresent() || subjectSpan.get().isEmpty() || relationSpan.isEmpty() || objectSpan.isEmpty()) {
                    return Optional.empty();
                }
                // (sort the relation span)
                Collections.sort(relationSpan, (a, b) -> Double.compare(a.pseudoPosition(), b.pseudoPosition()));
                // (add in the PP node, if it exists)
                if (pp.isPresent()) {
                    relationSpan.add(pp.get());
                }
                // (success!)
                RelationTriple.WithTree extraction = new RelationTriple.WithTree(
                        subjectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
                        relationSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
                        objectSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
                        parse, confidence.orElse(1.0));
                return Optional.of(extraction);
            }
        }
    }
    // Nothing found; return
    return Optional.empty();
}
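To make the acl case concrete: for a sentence such as "Obama, born in Hawaii, is our president.", the "born" clause hangs off the subject "Obama" via an acl edge, and this path would yield roughly (Obama; born in; Hawaii). The sketch below (assumptions marked in the comments) drives the segmenter through its public extract method, the public route to this private helper.

// parse: the SemanticGraph of the sentence; tokens: its List<CoreLabel>
// (both taken from an annotated sentence, as in the examples above)
RelationTripleSegmenter segmenter = new RelationTripleSegmenter();  // assuming the no-argument constructor
List<RelationTriple> extractions = segmenter.extract(parse, tokens);
for (RelationTriple extraction : extractions) {
    System.out.println(extraction.confidence + "\t" + extraction);
}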
Also used: SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge), RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), IndexedWord (edu.stanford.nlp.ling.IndexedWord)

Example 15 with RelationTriple

Use of edu.stanford.nlp.ie.util.RelationTriple in project CoreNLP by stanfordnlp.

From the class KBPAnnotator, method annotate.

/**
   * Annotate this document for KBP relations.
   * @param annotation The document to annotate.
   */
@Override
public void annotate(Annotation annotation) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    // NER is expected to have been added upstream; the dedicated NER passes are disabled:
    //casedNER.annotate(annotation);
    //caselessNER.annotate(annotation);
    // Annotate with Mentions
    entityMentionAnnotator.annotate(annotation);
    // Create simple document
    Document doc = new Document(kbpProperties, serializer.toProto(annotation));
    // Get the mentions in the document
    List<CoreMap> mentions = new ArrayList<>();
    for (CoreMap sentence : sentences) {
        mentions.addAll(sentence.get(CoreAnnotations.MentionsAnnotation.class));
    }
    List<CoreMap> pronounMentions = annotatePronominalMentions(annotation);
    mentions.addAll(pronounMentions);
    // Compute coreferent clusters
    // (map an index to a KBP mention)
    Map<Pair<Integer, Integer>, CoreMap> mentionByStartIndex = new HashMap<>();
    for (CoreMap mention : mentions) {
        for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
            mentionByStartIndex.put(Pair.makePair(token.sentIndex(), token.index()), mention);
        }
    }
    // (collect coreferent KBP mentions)
    // map from canonical mention -> other mentions
    Map<CoreMap, Set<CoreMap>> mentionsMap = new HashMap<>();
    if (annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class) != null) {
        for (Map.Entry<Integer, CorefChain> chain : annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).entrySet()) {
            CoreMap firstMention = null;
            for (CorefChain.CorefMention mention : chain.getValue().getMentionsInTextualOrder()) {
                CoreMap kbpMention = null;
                for (int i = mention.startIndex; i < mention.endIndex; ++i) {
                    if (mentionByStartIndex.containsKey(Pair.makePair(mention.sentNum - 1, i))) {
                        kbpMention = mentionByStartIndex.get(Pair.makePair(mention.sentNum - 1, i));
                        break;
                    }
                }
                if (firstMention == null) {
                    firstMention = kbpMention;
                }
                if (kbpMention != null) {
                    if (!mentionsMap.containsKey(firstMention)) {
                        mentionsMap.put(firstMention, new LinkedHashSet<>());
                    }
                    mentionsMap.get(firstMention).add(kbpMention);
                }
            }
        }
    }
    // (coreference acronyms)
    acronymMatch(mentions, mentionsMap);
    // (ensure valid NER tag for canonical mention)
    for (CoreMap key : new HashSet<>(mentionsMap.keySet())) {
        if (key.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null) {
            CoreMap newKey = null;
            for (CoreMap candidate : mentionsMap.get(key)) {
                if (candidate.get(CoreAnnotations.NamedEntityTagAnnotation.class) != null) {
                    newKey = candidate;
                    break;
                }
            }
            if (newKey != null) {
                mentionsMap.put(newKey, mentionsMap.remove(key));
            } else {
                // case: no mention in this chain has an NER tag.
                mentionsMap.remove(key);
            }
        }
    }
    // Propagate Entity Link
    for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
        String entityLink = entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class);
        for (CoreMap mention : entry.getValue()) {
            for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
                token.set(CoreAnnotations.WikipediaEntityAnnotation.class, entityLink);
            }
        }
    }
    // Create a canonical mention map
    Map<CoreMap, CoreMap> mentionToCanonicalMention = new HashMap<>();
    for (Map.Entry<CoreMap, Set<CoreMap>> entry : mentionsMap.entrySet()) {
        for (CoreMap mention : entry.getValue()) {
            // (set the NER tag + link to be axiomatically that of the canonical mention)
            mention.set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.getKey().get(CoreAnnotations.NamedEntityTagAnnotation.class));
            mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, entry.getKey().get(CoreAnnotations.WikipediaEntityAnnotation.class));
            // (add the mention; note: this must come after we set the NER!)
            mentionToCanonicalMention.put(mention, entry.getKey());
        }
    }
    // (add missing mentions)
    mentions.stream().filter(mention -> mentionToCanonicalMention.get(mention) == null).forEach(mention -> mentionToCanonicalMention.put(mention, mention));
    // Cluster mentions by sentence
    @SuppressWarnings("unchecked") List<CoreMap>[] mentionsBySentence = new List[annotation.get(CoreAnnotations.SentencesAnnotation.class).size()];
    for (int i = 0; i < mentionsBySentence.length; ++i) {
        mentionsBySentence[i] = new ArrayList<>();
    }
    for (CoreMap mention : mentionToCanonicalMention.keySet()) {
        mentionsBySentence[mention.get(CoreAnnotations.SentenceIndexAnnotation.class)].add(mention);
    }
    // Classify
    for (int sentenceI = 0; sentenceI < mentionsBySentence.length; ++sentenceI) {
        // the annotations
        List<RelationTriple> triples = new ArrayList<>();
        List<CoreMap> candidates = mentionsBySentence[sentenceI];
        // determine sentence length
        int sentenceLength = annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).get(CoreAnnotations.TokensAnnotation.class).size();
        // if the sentence is too long, don't run KBP on it
        if (maxLength != -1 && sentenceLength > maxLength) {
            // set the triples annotation to an empty list of RelationTriples
            annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, triples);
            // continue to next sentence
            continue;
        }
        // sentence isn't too long, so continue processing this sentence
        for (int subjI = 0; subjI < candidates.size(); ++subjI) {
            CoreMap subj = candidates.get(subjI);
            int subjBegin = subj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
            int subjEnd = subj.get(CoreAnnotations.TokensAnnotation.class).get(subj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
            Optional<KBPRelationExtractor.NERTag> subjNER = KBPRelationExtractor.NERTag.fromString(subj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
            if (subjNER.isPresent()) {
                for (int objI = 0; objI < candidates.size(); ++objI) {
                    if (subjI == objI) {
                        continue;
                    }
                    if (Thread.interrupted()) {
                        throw new RuntimeInterruptedException();
                    }
                    CoreMap obj = candidates.get(objI);
                    int objBegin = obj.get(CoreAnnotations.TokensAnnotation.class).get(0).index() - 1;
                    int objEnd = obj.get(CoreAnnotations.TokensAnnotation.class).get(obj.get(CoreAnnotations.TokensAnnotation.class).size() - 1).index();
                    Optional<KBPRelationExtractor.NERTag> objNER = KBPRelationExtractor.NERTag.fromString(obj.get(CoreAnnotations.NamedEntityTagAnnotation.class));
                    if (objNER.isPresent() && KBPRelationExtractor.RelationType.plausiblyHasRelation(subjNER.get(), objNER.get())) {
                        // type check
                        KBPRelationExtractor.KBPInput input = new KBPRelationExtractor.KBPInput(new Span(subjBegin, subjEnd), new Span(objBegin, objEnd), subjNER.get(), objNER.get(), doc.sentence(sentenceI));
                        // Classify the candidate (subject, object) pair
                        Pair<String, Double> prediction = extractor.classify(input);
                        // Handle the classifier output
                        if (!KBPStatisticalExtractor.NO_RELATION.equals(prediction.first)) {
                            RelationTriple triple = new RelationTriple.WithLink(
                                    subj.get(CoreAnnotations.TokensAnnotation.class),
                                    mentionToCanonicalMention.get(subj).get(CoreAnnotations.TokensAnnotation.class),
                                    Collections.singletonList(new CoreLabel(new Word(prediction.first))),
                                    obj.get(CoreAnnotations.TokensAnnotation.class),
                                    mentionToCanonicalMention.get(obj).get(CoreAnnotations.TokensAnnotation.class),
                                    prediction.second,
                                    sentences.get(sentenceI).get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class),
                                    subj.get(CoreAnnotations.WikipediaEntityAnnotation.class),
                                    obj.get(CoreAnnotations.WikipediaEntityAnnotation.class));
                            triples.add(triple);
                        }
                    }
                }
            }
        }
        // Set triples
        annotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceI).set(CoreAnnotations.KBPTriplesAnnotation.class, triples);
    }
}
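A minimal end-to-end sketch (not from the original file) of exercising this annotator through the standard pipeline and reading the KBPTriplesAnnotation it sets; the prerequisite annotator list shown is the usual one, but exact requirements may vary across CoreNLP versions.

Properties props = PropertiesUtils.asProperties(
        "annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref,kbp");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation doc = new Annotation("Barack Obama was born in Hawaii.");
pipeline.annotate(doc);
for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
    // KBPAnnotator.annotate() sets this list on every sentence (possibly empty)
    for (RelationTriple triple : sentence.get(CoreAnnotations.KBPTriplesAnnotation.class)) {
        System.out.println(triple.subjectGloss() + "\t" + triple.relationGloss() + "\t" + triple.objectGloss());
    }
}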
Also used: CoreLabel (edu.stanford.nlp.ling.CoreLabel), java.util, CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), CorefChain (edu.stanford.nlp.coref.data.CorefChain), IOUtils (edu.stanford.nlp.io.IOUtils), edu.stanford.nlp.util, Redwood (edu.stanford.nlp.util.logging.Redwood), IOException (java.io.IOException), Document (edu.stanford.nlp.simple.Document), Collectors (java.util.stream.Collectors), LinearClassifier (edu.stanford.nlp.classify.LinearClassifier), Classifier (edu.stanford.nlp.classify.Classifier), Span (edu.stanford.nlp.ie.machinereading.structure.Span), edu.stanford.nlp.ie, Word (edu.stanford.nlp.ling.Word), CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation), SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations), WordLists (edu.stanford.nlp.coref.data.WordLists), RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException), CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations)

Aggregations

RelationTriple (edu.stanford.nlp.ie.util.RelationTriple): 20 usages
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 13 usages
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 13 usages
SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 12 usages
SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 10 usages
CorefCoreAnnotations (edu.stanford.nlp.coref.CorefCoreAnnotations): 8 usages
CorefChain (edu.stanford.nlp.coref.data.CorefChain): 8 usages
Tree (edu.stanford.nlp.trees.Tree): 8 usages
java.util: 7 usages
Span (edu.stanford.nlp.ie.machinereading.structure.Span): 6 usages
SentimentCoreAnnotations (edu.stanford.nlp.sentiment.SentimentCoreAnnotations): 6 usages
edu.stanford.nlp.util: 6 usages
CoreMap (edu.stanford.nlp.util.CoreMap): 6 usages
Collectors (java.util.stream.Collectors): 6 usages
EntityMention (edu.stanford.nlp.ie.machinereading.structure.EntityMention): 5 usages
RelationMention (edu.stanford.nlp.ie.machinereading.structure.RelationMention): 5 usages
IndexedWord (edu.stanford.nlp.ling.IndexedWord): 5 usages
Annotation (edu.stanford.nlp.pipeline.Annotation): 5 usages
SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge): 5 usages
RNNCoreAnnotations (edu.stanford.nlp.neural.rnn.RNNCoreAnnotations): 4 usages