Search in sources :

Example 11 with SemgrexMatcher

Use of edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher in the CoreNLP project by stanfordnlp.

From the class RelationTripleSegmenter, method extract.

/**
   * Extract the nominal patterns from this sentence, first via surface (TokensRegex) patterns
   * over the token sequence, then via dependency (Semgrex) patterns over the parse.
   *
   * @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
   * @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
   *
   * @param parse The parse tree of the sentence to annotate.
   * @param tokens The tokens of the sentence to annotate.
   * @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
   */
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
    List<RelationTriple> extractions = new ArrayList<>();
    // De-duplicates triples found by more than one pattern, keyed on (subject span, relation gloss, object span).
    Set<Triple<Span, String, Span>> alreadyExtracted = new HashSet<>();
    //
    // Pass 1: surface patterns over the raw token sequence
    //
    for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
        TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
        while (tokenMatcher.find()) {
            boolean missingPrefixBe;
            boolean missingSuffixOf = false;
            // Create subject (expanding to the full NER span; CoreLabel.index() is 1-based, spans are 0-based)
            List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
            Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
            List<CoreLabel> subjectTokens = new ArrayList<>();
            for (int i : subjectSpan) {
                subjectTokens.add(tokens.get(i));
            }
            // Create object
            List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
            Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
            // Skip degenerate matches where subject and object overlap
            if (Span.overlaps(subjectSpan, objectSpan)) {
                continue;
            }
            List<CoreLabel> objectTokens = new ArrayList<>();
            for (int i : objectSpan) {
                objectTokens.add(tokens.get(i));
            }
            // Create relation
            if (!subjectTokens.isEmpty() && !objectTokens.isEmpty()) {
                List<CoreLabel> relationTokens = new ArrayList<>();
                // (mark the implicit copula: a prefix 'be' is always missing for these surface matches)
                missingPrefixBe = true;
                // (add a complement to the 'be')
                List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
                if (beofComp != null) {
                    // (add the complement
                    for (CoreMap token : beofComp) {
                        if (token instanceof CoreLabel) {
                            relationTokens.add((CoreLabel) token);
                        } else {
                            relationTokens.add(new CoreLabel(token));
                        }
                    }
                    // (add the 'of')
                    missingSuffixOf = true;
                }
                // Add extraction, unless an identical triple was already produced
                String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
                if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
                    RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
                    //noinspection ConstantConditions
                    extraction.isPrefixBe(missingPrefixBe);
                    extraction.isSuffixOf(missingSuffixOf);
                    extractions.add(extraction);
                    alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
                }
            }
        }
    }
    //
    // Pass 2: dependency patterns over the parse.
    // NOTE: this loop was previously nested inside the token-pattern loop and therefore re-ran once
    // per token pattern; the alreadyExtracted set masked the redundancy. Hoisted out so it runs once.
    //
    for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
        SemgrexMatcher matcher = semgrex.matcher(parse);
        while (matcher.find()) {
            boolean missingPrefixBe = false;
            boolean missingSuffixBe = false;
            boolean istmod = false;
            // Get relaux if applicable; the matched arc is excluded when chunking subject/object below
            String relaux = matcher.getRelnString("relaux");
            String ignoredArc = relaux;
            if (ignoredArc == null) {
                ignoredArc = matcher.getRelnString("arc");
            }
            // Create subject
            IndexedWord subject = matcher.getNode("subject");
            List<IndexedWord> subjectTokens = new ArrayList<>();
            Span subjectSpan;
            if (subject.ner() != null && !"O".equals(subject.ner())) {
                // Named entity: expand to the full NER span
                subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
                for (int i : subjectSpan) {
                    subjectTokens.add(new IndexedWord(tokens.get(i)));
                }
            } else {
                // Otherwise, chunk from the dependency tree, falling back to the head word alone
                subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
                subjectSpan = Util.tokensToSpan(subjectTokens);
            }
            // Create object
            IndexedWord object = matcher.getNode("object");
            List<IndexedWord> objectTokens = new ArrayList<>();
            Span objectSpan;
            if (object.ner() != null && !"O".equals(object.ner())) {
                objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
                for (int i : objectSpan) {
                    objectTokens.add(new IndexedWord(tokens.get(i)));
                }
            } else {
                objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
                objectSpan = Util.tokensToSpan(objectTokens);
            }
            // Check that the pair is valid
            if (Span.overlaps(subjectSpan, objectSpan)) {
                // We extracted an identity
                continue;
            }
            if (subjectSpan.end() == objectSpan.start() - 1 && (tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(subjectSpan.end()).tag()))) {
                // We're straddling a clause (punctuation or conjunction between subject and object)
                continue;
            }
            if (objectSpan.end() == subjectSpan.start() - 1 && (tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(objectSpan.end()).tag()))) {
                // We're straddling a clause
                continue;
            }
            // Get any prepositional edges ("case" dependents)
            String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
            IndexedWord prepWord = null;
            // (these usually come from the object)
            boolean prepositionIsPrefix = false;
            for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
                if (edge.getRelation().toString().equals("case")) {
                    prepWord = edge.getDependent();
                }
            }
            // (...but sometimes from the subject)
            if (prepWord == null) {
                for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
                    if (edge.getRelation().toString().equals("case")) {
                        prepositionIsPrefix = true;
                        prepWord = edge.getDependent();
                    }
                }
            }
            List<IndexedWord> prepChunk = Collections.emptyList();
            if (prepWord != null && !expected.equals("tmod")) {
                Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
                if (!optionalPrepChunk.isPresent()) {
                    continue;
                }
                prepChunk = optionalPrepChunk.get();
                // ascending sort by pseudo-position
                Collections.sort(prepChunk, (a, b) -> Double.compare(a.pseudoPosition(), b.pseudoPosition()));
            }
            // Get the relation
            if (!subjectTokens.isEmpty() && !objectTokens.isEmpty()) {
                LinkedList<IndexedWord> relationTokens = new LinkedList<>();
                IndexedWord relNode = matcher.getNode("relation");
                if (relNode != null) {
                    // Case: we have a grounded relation span
                    // (add the relation)
                    relationTokens.add(relNode);
                    // (add any prepositional case markings)
                    if (prepositionIsPrefix) {
                        // We're almost certainly missing a suffix 'be'
                        missingSuffixBe = true;
                        for (int i = prepChunk.size() - 1; i >= 0; --i) {
                            relationTokens.addFirst(prepChunk.get(i));
                        }
                    } else {
                        relationTokens.addAll(prepChunk);
                    }
                    if (expected.equalsIgnoreCase("tmod")) {
                        istmod = true;
                    }
                } else {
                    // (mark it as missing a preceding 'be'
                    if (!expected.equals("poss")) {
                        missingPrefixBe = true;
                    }
                    // (add any prepositional case markings)
                    if (prepositionIsPrefix) {
                        for (int i = prepChunk.size() - 1; i >= 0; --i) {
                            relationTokens.addFirst(prepChunk.get(i));
                        }
                    } else {
                        relationTokens.addAll(prepChunk);
                    }
                    if (expected.equalsIgnoreCase("tmod")) {
                        istmod = true;
                    }
                    // (some fine-tuning)
                    if (allowNominalsWithoutNER && "of".equals(expected)) {
                        // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
                        continue;
                    }
                }
                // Add extraction, unless an identical triple was already produced
                String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
                if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
                    RelationTriple extraction = new RelationTriple(subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
                    extraction.istmod(istmod);
                    extraction.isPrefixBe(missingPrefixBe);
                    extraction.isSuffixBe(missingSuffixBe);
                    extractions.add(extraction);
                    alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
                }
            }
        }
    }
    //
    // Filter downward polarity extractions: drop a triple only if EVERY token has an
    // explicitly downward polarity annotation.
    //
    extractions.removeIf(term -> {
        for (CoreLabel token : term) {
            if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null || !token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
                // At least one token is not downward-polarity: keep the extraction.
                return false;
            }
        }
        // Don't extract things in downward polarity contexts.
        return true;
    });
    // Return
    return extractions;
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) Span(edu.stanford.nlp.ie.machinereading.structure.Span) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) RelationTriple(edu.stanford.nlp.ie.util.RelationTriple) CoreLabel(edu.stanford.nlp.ling.CoreLabel) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 12 with SemgrexMatcher

use of edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher in project CoreNLP by stanfordnlp.

From the class NaturalLogicAnnotator, method annotateUnaries.

/**
   * Annotate any unary quantifiers that weren't found in the main {@link NaturalLogicAnnotator#annotateOperators(CoreMap)} method.
   * Mutates the sentence's tokens in place by setting {@code OperatorAnnotation}s on quantifier head words.
   * @param sentence The sentence to annotate.
   */
private static void annotateUnaries(CoreMap sentence) {
    // Get tree and tokens; prefer the basic dependency tree, falling back to the enhanced one.
    SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    if (tree == null) {
        tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    }
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    // Get operator exists mask: mark every token position already covered by a previously
    // found operator, so we don't annotate the same span twice.
    boolean[] isOperator = new boolean[tokens.size()];
    for (int i = 0; i < isOperator.length; ++i) {
        OperatorSpec spec = tokens.get(i).get(OperatorAnnotation.class);
        if (spec != null) {
            for (int k = spec.quantifierBegin; k < spec.quantifierEnd; ++k) {
                isOperator[k] = true;
            }
        }
    }
    // Match Semgrex: look for unary quantifier constructions in the dependency tree
    SemgrexMatcher matcher = UNARY_PATTERN.matcher(tree);
    while (matcher.find()) {
        // Get relevant nodes
        IndexedWord quantifier = matcher.getNode("quantifier");
        String word = quantifier.word().toLowerCase();
        if (word.equals("a") || word.equals("an") || word.equals("the") || "CD".equals(quantifier.tag())) {
            // These are absurdly common, and uninformative, and we're just going to shoot ourselves in the foot from parsing errors and idiomatic expressions.
            continue;
        }
        IndexedWord subject = matcher.getNode("subject");
        // ... If there is not already an operator there
        // (IndexedWord.index() is 1-based; the token list is 0-based)
        if (!isOperator[quantifier.index() - 1]) {
            Optional<Triple<Operator, Integer, Integer>> quantifierInfo = validateQuantifierByHead(sentence, quantifier);
            // ... and if we found a quantifier span (operator + begin/end indices)
            if (quantifierInfo.isPresent()) {
                // Then add the unary operator! Only the quantifier's head token is annotated.
                OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, subject, Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third), null, false, null, tokens.size());
                CoreLabel token = tokens.get(quantifier.index() - 1);
                token.set(OperatorAnnotation.class, scope);
            }
        }
    }
    // Match TokensRegex: handle "doubt"-style surface triggers that introduce negative polarity.
    TokenSequenceMatcher tokenMatcher = DOUBT_PATTERN.matcher(tokens);
    while (tokenMatcher.find()) {
        // NOTE(review): unchecked casts — groupNodes returns List<? extends CoreMap>; assumed to hold CoreLabels here.
        List<CoreLabel> doubt = (List<CoreLabel>) tokenMatcher.groupNodes("$doubt");
        List<CoreLabel> target = (List<CoreLabel>) tokenMatcher.groupNodes("$target");
        for (CoreLabel word : doubt) {
            // Each trigger word gets a general negative-polarity operator scoped over the target span.
            OperatorSpec spec = new OperatorSpec(Operator.GENERAL_NEG_POLARITY, word.index() - 1, word.index(), target.get(0).index() - 1, target.get(target.size() - 1).index(), 0, 0, tokens.size());
            word.set(OperatorAnnotation.class, spec);
        }
    }
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 13 with SemgrexMatcher

use of edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher in project CoreNLP by stanfordnlp.

From the class NaturalLogicAnnotator, method annotateOperators.

/**
   * Find the operators in this sentence, annotating the head word (only!) of each operator with the
   * {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotations.OperatorAnnotation}.
   *
   * @param sentence As in {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotator#doOneSentence(edu.stanford.nlp.pipeline.Annotation, edu.stanford.nlp.util.CoreMap)}
   */
private void annotateOperators(CoreMap sentence) {
    // Prefer the basic dependency tree; fall back to the enhanced one if absent.
    SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (tree == null) {
        tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    }
    for (SemgrexPattern pattern : PATTERNS) {
        SemgrexMatcher matcher = pattern.matcher(tree);
        while (matcher.find()) {
            // Get terms. A capitalized "Subject" capture marks a proper-noun subject,
            // which then doubles as its own (implicit) quantifier.
            IndexedWord properSubject = matcher.getNode("Subject");
            IndexedWord quantifier, subject;
            boolean namedEntityQuantifier = false;
            if (properSubject != null) {
                quantifier = subject = properSubject;
                namedEntityQuantifier = true;
            } else {
                quantifier = matcher.getNode("quantifier");
                subject = matcher.getNode("subject");
            }
            // Validate quantifier
            // At the end of this, quantifierInfo holds (operator, spanBegin, spanEnd) if the quantifier is valid.
            Optional<Triple<Operator, Integer, Integer>> quantifierInfo;
            if (namedEntityQuantifier) {
                // named entities have the "all" semantics by default.
                if (!neQuantifiers) {
                    continue;
                }
                // note: empty quantifier span given
                quantifierInfo = Optional.of(Triple.makeTriple(Operator.IMPLICIT_NAMED_ENTITY, quantifier.index(), quantifier.index()));
            } else {
                // find the quantifier, and return some info about it.
                quantifierInfo = validateQuantifierByHead(sentence, quantifier);
            }
            // (fix up 'there are'): if the subject is the copula 'be' with an expletive dependent,
            // re-point the subject at the true nsubj dependent.
            if ("be".equals(subject == null ? null : subject.lemma())) {
                boolean hasExpl = false;
                IndexedWord newSubject = null;
                for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(subject)) {
                    if ("nsubj".equals(outgoingEdge.getRelation().toString())) {
                        newSubject = outgoingEdge.getDependent();
                    } else if ("expl".equals(outgoingEdge.getRelation().toString())) {
                        hasExpl = true;
                    }
                }
                if (hasExpl) {
                    subject = newSubject;
                }
            }
            // (fix up '$n$ of'): for numeric subjects, follow an nmod edge to the real head noun.
            if ("CD".equals(subject == null ? null : subject.tag())) {
                for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(subject)) {
                    String rel = outgoingEdge.getRelation().toString();
                    if (rel.startsWith("nmod")) {
                        subject = outgoingEdge.getDependent();
                    }
                }
            }
            // Set tokens
            if (quantifierInfo.isPresent()) {
                // Compute span
                OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, matcher.getNode("pivot"), Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third), subject, namedEntityQuantifier, matcher.getNode("object"), tokens.size());
                // Set annotation on the quantifier's head token only (IndexedWord.index() is 1-based)
                CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(quantifier.index() - 1);
                OperatorSpec oldScope = token.get(OperatorAnnotation.class);
                if (oldScope == null || oldScope.quantifierLength() < scope.quantifierLength() || oldScope.instance != scope.instance) {
                    // New annotation, or a longer/different match: overwrite
                    token.set(OperatorAnnotation.class, scope);
                } else {
                    // Same operator instance: merge the two scopes
                    token.set(OperatorAnnotation.class, OperatorSpec.merge(oldScope, scope));
                }
            }
        }
    }
    // Ensure we didn't select overlapping quantifiers. For example, "a" and "a few" can often overlap.
    // In these cases, take the longer quantifier match.
    List<OperatorSpec> quantifiers = new ArrayList<>();
    sentence.get(CoreAnnotations.TokensAnnotation.class).stream().filter(token -> token.containsKey(OperatorAnnotation.class)).forEach(token -> quantifiers.add(token.get(OperatorAnnotation.class)));
    // Longest quantifiers first; shorter overlapping ones lose their annotation below.
    quantifiers.sort((x, y) -> y.quantifierLength() - x.quantifierLength());
    for (OperatorSpec quantifier : quantifiers) {
        for (int i = quantifier.quantifierBegin; i < quantifier.quantifierEnd; ++i) {
            if (i != quantifier.quantifierHead) {
                // Clear the annotation from every non-head token within the quantifier span
                tokens.get(i).remove(OperatorAnnotation.class);
            }
        }
    }
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) CoreLabel(edu.stanford.nlp.ling.CoreLabel) java.util(java.util) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) Redwood(edu.stanford.nlp.util.logging.Redwood) edu.stanford.nlp.util(edu.stanford.nlp.util) SentenceAnnotator(edu.stanford.nlp.pipeline.SentenceAnnotator) SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) NaturalLogicAnnotations(edu.stanford.nlp.naturalli.NaturalLogicAnnotations) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) Span(edu.stanford.nlp.ie.machinereading.structure.Span) CoreAnnotation(edu.stanford.nlp.ling.CoreAnnotation) Annotation(edu.stanford.nlp.pipeline.Annotation) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) TokenSequenceMatcher(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) IndexedWord(edu.stanford.nlp.ling.IndexedWord) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 14 with SemgrexMatcher

use of edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher in project CoreNLP by stanfordnlp.

From the class KBPSemgrexExtractor, method matches.

/**
   * Returns whether any of the given patterns match this tree.
   * Side effect: tokens in the subject/object spans whose NER tag is "O" are overwritten
   * with the corresponding KBP type name before matching.
   */
private boolean matches(CoreMap sentence, Collection<SemgrexPattern> rulesForRel, KBPInput input, SemanticGraph graph) {
    // Nothing to match against.
    if (graph == null || graph.isEmpty()) {
        return false;
    }
    // Backfill NER on the subject and object spans so NER-conditioned patterns can fire.
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (int idx : input.subjectSpan) {
        CoreLabel token = tokens.get(idx);
        if ("O".equals(token.ner())) {
            token.setNER(input.subjectType.name);
        }
    }
    for (int idx : input.objectSpan) {
        CoreLabel token = tokens.get(idx);
        if ("O".equals(token.ner())) {
            token.setNER(input.objectType.name);
        }
    }
    // Try each rule; succeed on the first match that grounds both the entity and the slot.
    for (SemgrexPattern rule : rulesForRel) {
        try {
            SemgrexMatcher matcher = rule.matcher(graph);
            while (matcher.find()) {
                IndexedWord entity = matcher.getNode("entity");
                IndexedWord slot = matcher.getNode("slot");
                // IndexedWord.index() is 1-based while spans are 0-based, hence the +1 shift.
                boolean entityInSubject = entity.index() >= input.subjectSpan.start() + 1 && entity.index() <= input.subjectSpan.end();
                boolean slotInObject = slot.index() >= input.objectSpan.start() + 1 && slot.index() <= input.objectSpan.end();
                if (entityInSubject && slotInObject) {
                    return true;
                }
            }
        } catch (Exception e) {
            //Happens when graph has no roots
            return false;
        }
    }
    return false;
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) IndexedWord(edu.stanford.nlp.ling.IndexedWord) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException)

Example 15 with SemgrexMatcher

use of edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher in project CoreNLP by stanfordnlp.

From the class CreateClauseDataset, method subjectObjectPairs.

/**
   * Create a dataset of subject/object pairs, such that a sequence of splits that segments this
   * subject and object is a correct sequence.
   *
   * @param depparse The dependency parse of the sentence.
   * @param tokens The tokens of the sentence. (NOTE(review): currently unused outside the
   *               commented-out debug output — hence the {@code @SuppressWarnings}.)
   * @param traceTargets The set of spans corresponding to targets of traces.
   * @param traceSources The set of indices in a sentence corresponding to the sources of traces.
   * @return A dataset of subject/object spans.
   */
@SuppressWarnings("UnusedParameters")
private static Collection<Pair<Span, Span>> subjectObjectPairs(SemanticGraph depparse, List<CoreLabel> tokens, Map<Integer, Span> traceTargets, Map<Integer, Integer> traceSources) {
    //    log(StringUtils.join(tokens.stream().map(CoreLabel::word), " "));
    List<Pair<Span, Span>> data = new ArrayList<>();
    // Pass 1: VP patterns — verb/object pairs whose subject must be recovered via a trace.
    for (SemgrexPattern vpPattern : segmenter.VP_PATTERNS) {
        SemgrexMatcher matcher = vpPattern.matcher(depparse);
        while (matcher.find()) {
            // Get the verb and object
            IndexedWord verb = matcher.getNode("verb");
            IndexedWord object = matcher.getNode("object");
            if (verb != null && object != null) {
                // See if there is already a subject attached (to either the verb or the object);
                // if so, this VP doesn't need a trace and is skipped.
                boolean hasSubject = false;
                for (SemanticGraphEdge edge : depparse.outgoingEdgeIterable(verb)) {
                    if (edge.getRelation().toString().contains("subj")) {
                        hasSubject = true;
                    }
                }
                for (SemanticGraphEdge edge : depparse.outgoingEdgeIterable(object)) {
                    if (edge.getRelation().toString().contains("subj")) {
                        hasSubject = true;
                    }
                }
                if (!hasSubject) {
                    // Get the spans for the verb and object
                    Optional<List<IndexedWord>> verbChunk = segmenter.getValidChunk(depparse, verb, segmenter.VALID_ADVERB_ARCS, Optional.empty(), true);
                    Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
                    if (verbChunk.isPresent() && objectChunk.isPresent()) {
                        Collections.sort(verbChunk.get(), (a, b) -> a.index() - b.index());
                        Collections.sort(objectChunk.get(), (a, b) -> a.index() - b.index());
                        // Find a trace whose source lies within one token of the verb span
                        int traceId = -1;
                        Span verbSpan = toSpan(verbChunk.get());
                        Span traceSpan = Span.fromValues(verbSpan.start() - 1, verbSpan.end() + 1);
                        for (Map.Entry<Integer, Integer> entry : traceSources.entrySet()) {
                            if (traceSpan.contains(entry.getValue())) {
                                traceId = entry.getKey();
                            }
                        }
                        //noinspection StatementWithEmptyBody
                        if (traceId < 0) {
                        // Register the VP as an unknown VP
                        //                List<CoreLabel> vpChunk = new ArrayList<>();
                        //                vpChunk.addAll(verbChunk.get());
                        //                vpChunk.addAll(objectChunk.get());
                        //                Collections.sort(vpChunk, (a, b) -> a.index() - b.index());
                        //                debug("could not find trace for " + vpChunk);
                        } else {
                            // Add the obj chunk, with the subject recovered from the trace's target span
                            Span subjectSpan = traceTargets.get(traceId);
                            Span objectSpan = toSpan(objectChunk.get());
                            if (subjectSpan != null) {
                                //                  debug("(" +
                                //                      StringUtils.join(tokens.subList(subjectSpan.start(), subjectSpan.end()).stream().map(CoreLabel::word), " ") + "; " +
                                //                      verb.word() + "; " +
                                //                      StringUtils.join(tokens.subList(objectSpan.start(), objectSpan.end()).stream().map(CoreLabel::word), " ") +
                                //                      ")");
                                data.add(Pair.makePair(subjectSpan, objectSpan));
                            }
                        }
                    }
                }
            }
        }
    }
    // Pass 2: Run vanilla pattern splits — subject/object pairs grounded directly in the parse.
    for (SemgrexPattern vpPattern : segmenter.VERB_PATTERNS) {
        SemgrexMatcher matcher = vpPattern.matcher(depparse);
        while (matcher.find()) {
            // Get the subject and object
            IndexedWord subject = matcher.getNode("subject");
            IndexedWord object = matcher.getNode("object");
            if (subject != null && object != null) {
                Optional<List<IndexedWord>> subjectChunk = segmenter.getValidChunk(depparse, subject, segmenter.VALID_SUBJECT_ARCS, Optional.empty(), true);
                Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
                if (subjectChunk.isPresent() && objectChunk.isPresent()) {
                    Span subjectSpan = toSpan(subjectChunk.get());
                    Span objectSpan = toSpan(objectChunk.get());
                    data.add(Pair.makePair(subjectSpan, objectSpan));
                }
            }
        }
    }
    return data;
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) Span(edu.stanford.nlp.ie.machinereading.structure.Span) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Aggregations

SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher)23 IndexedWord (edu.stanford.nlp.ling.IndexedWord)19 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)13 SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)10 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)9 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)4 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 Span (edu.stanford.nlp.ie.machinereading.structure.Span)3 TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher)3 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)3 GrammaticalRelation (edu.stanford.nlp.trees.GrammaticalRelation)3 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)2 RelationTriple (edu.stanford.nlp.ie.util.RelationTriple)1 RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException)1 CoreAnnotation (edu.stanford.nlp.ling.CoreAnnotation)1 NaturalLogicAnnotations (edu.stanford.nlp.naturalli.NaturalLogicAnnotations)1 EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)1 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)1 Annotation (edu.stanford.nlp.pipeline.Annotation)1 SentenceAnnotator (edu.stanford.nlp.pipeline.SentenceAnnotator)1