Search in sources :

Example 6 with SemgrexPattern

use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp.

In class ScorePhrases, the method runParallelApplyPats:

/**
 * Applies a single learned pattern to all sentences in parallel and merges the
 * per-thread results into the supplied accumulators.
 *
 * @param sents map from sentence id to its parsed/tokenized form
 * @param label the answer class currently being learned
 * @param pattern the learned pattern (surface or dependency) to apply
 * @param wordsandLemmaPatExtracted accumulator: candidate phrase x pattern counts (mutated)
 * @param matchedTokensByPat accumulator: pattern -> (sentence id, start, end) matches (mutated)
 * @param alreadyLabeledWords accumulator: phrases already labeled (mutated)
 * @throws UnsupportedOperationException if the configured pattern type is neither SURFACE nor DEP
 * @throws RuntimeException wrapping any failure of a worker task
 */
private void runParallelApplyPats(Map<String, DataInstance> sents, String label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat, Set<CandidatePhrase> alreadyLabeledWords) {
    Redwood.log(Redwood.DBG, "Applying pattern " + pattern + " to a total of " + sents.size() + " sentences ");
    // Classes whose labeled words must not be re-extracted for this label
    // (plus the catch-all "OTHERSEM"), when the corresponding flag is set.
    List<String> notAllowedClasses = new ArrayList<>();
    List<String> sentids = CollectionUtils.toList(sents.keySet());
    if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass) {
        for (String l : constVars.getAnswerClass().keySet()) {
            if (!l.equals(label)) {
                notAllowedClasses.add(l);
            }
        }
        notAllowedClasses.add("OTHERSEM");
    }
    // Compile the learned pattern into its executable form (token-sequence
    // regex or semgrex), keyed back to the originating pattern object.
    Map<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
    Map<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
    if (constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
        surfacePatternsLearnedThisIterConverted = new HashMap<>();
        String patternStr = null;
        try {
            patternStr = pattern.toString(notAllowedClasses);
            TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
            surfacePatternsLearnedThisIterConverted.put(pat, pattern);
        } catch (Exception e) {
            log.info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
            throw e;
        }
    } else if (constVars.patternType.equals(PatternFactory.PatternType.DEP)) {
        depPatternsLearnedThisIterConverted = new HashMap<>();
        SemgrexPattern pat = SemgrexPattern.compile(pattern.toString(notAllowedClasses), new edu.stanford.nlp.semgraph.semgrex.Env(constVars.env.get(label).getVariables()));
        depPatternsLearnedThisIterConverted.put(pat, pattern);
    } else {
        throw new UnsupportedOperationException();
    }
    // Apply the patterns and extract candidate phrases.
    // num = number of sentences assigned to each worker.
    int num;
    int numThreads = constVars.numThreads;
    // If number of sentences is small, do not create so many threads.
    if (sents.size() < 50)
        numThreads = 1;
    if (numThreads == 1)
        num = sents.size();
    else
        num = sents.size() / (numThreads - 1);
    // FIX: size the pool by the (possibly reduced) local numThreads rather than
    // constVars.numThreads, so small inputs don't spawn idle worker threads.
    ExecutorService executor = Executors.newFixedThreadPool(numThreads);
    List<Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>>> list = new ArrayList<>();
    for (int i = 0; i < numThreads; i++) {
        Callable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> task = null;
        // Each worker receives its own contiguous slice of sentence ids; a
        // single worker just gets the whole list.
        if (pattern.type.equals(PatternFactory.PatternType.SURFACE))
            task = new ApplyPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
        else
            task = new ApplyDepPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num, Math.min(sentids.size(), (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
        Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> submit = executor.submit(task);
        list.add(submit);
    }
    // Now retrieve and merge the per-thread results.
    for (Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> future : list) {
        try {
            Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> result = future.get();
            Redwood.log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.first());
            wordsandLemmaPatExtracted.addAll(result.first());
            matchedTokensByPat.addAll(result.second());
            alreadyLabeledWords.addAll(result.third());
        } catch (InterruptedException e) {
            // FIX: restore the interrupt status for callers before bailing out.
            Thread.currentThread().interrupt();
            executor.shutdownNow();
            throw new RuntimeException(e);
        } catch (Exception e) {
            executor.shutdownNow();
            throw new RuntimeException(e);
        }
    }
    executor.shutdown();
}
Also used : Env(edu.stanford.nlp.ling.tokensregex.Env) TokenSequencePattern(edu.stanford.nlp.ling.tokensregex.TokenSequencePattern) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) IOException(java.io.IOException) InvocationTargetException(java.lang.reflect.InvocationTargetException) ApplyDepPatterns(edu.stanford.nlp.patterns.dep.ApplyDepPatterns) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)

Example 7 with SemgrexPattern

use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp.

In class Mention, the method findDependentVerb:

/**
 * Finds the verb governing the head of mention {@code m} in its dependency
 * graph, together with the relation connecting them.
 *
 * @param m the mention to search from; its dependency graph must have a root
 *     for semgrex matching to apply
 * @return a pair of (governing verb, relation string), or an empty Pair if the
 *     graph has no roots or no governing verb is found
 */
private static Pair<IndexedWord, String> findDependentVerb(Mention m) {
    // Semgrex requires a rooted graph; bail out early otherwise.
    if (m.dependency.getRoots().isEmpty()) {
        return new Pair<>();
    }
    // would be nice to condense this pattern, but sadly =reln
    // always uses the last relation in the sequence, not the first
    SemgrexPattern pattern = SemgrexPattern.compile("{idx:" + (m.headIndex + 1) + "} [ <=reln {tag:/^V.*/}=verb | <=reln ({} << {tag:/^V.*/}=verb) ]");
    SemgrexMatcher matcher = pattern.matcher(m.dependency);
    // Only the first match is ever used, so an if reads more honestly than
    // the original while-loop that returned on its first iteration.
    if (matcher.find()) {
        return Pair.makePair(matcher.getNode("verb"), matcher.getRelnString("reln"));
    }
    return new Pair<>();
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)

Example 8 with SemgrexPattern

use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp.

In class Ssurgeon, the method ssurgeonPatternFromXML:

/**
 * Converts the root Element of an Ssurgeon pattern (SSURGEON_ELEM_TAG) into
 * its corresponding {@link SsurgeonPattern} object: compiles the embedded
 * semgrex, attaches the uid/notes, parses each edit line, and, if present,
 * the predicate.
 *
 * @param elt the XML element holding the serialized pattern
 * @return the reconstructed SsurgeonPattern
 * @throws Exception if the semgrex, an edit line, or the predicate fails to parse
 */
@SuppressWarnings("unchecked")
public static SsurgeonPattern ssurgeonPatternFromXML(Element elt) throws Exception {
    String uid = getTagText(elt, SsurgeonPattern.UID_ELEM_TAG);
    String notes = getTagText(elt, SsurgeonPattern.NOTES_ELEM_TAG);
    String semgrexString = getTagText(elt, SsurgeonPattern.SEMGREX_ELEM_TAG);
    SemgrexPattern semgrexPattern = SemgrexPattern.compile(semgrexString);

    SsurgeonPattern result = new SsurgeonPattern(uid, semgrexPattern);
    result.setNotes(notes);

    // Each edit-list child element contributes one parsed edit.
    NodeList editNodes = elt.getElementsByTagName(SsurgeonPattern.EDIT_LIST_ELEM_TAG);
    int numEdits = editNodes.getLength();
    for (int i = 0; i < numEdits; i++) {
        Node child = editNodes.item(i);
        if (child.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String editLine = getEltText((Element) child);
        result.addEdit(Ssurgeon.parseEditLine(editLine));
    }

    // The predicate element is optional; parse it only when present.
    Element predElt = getFirstTag(elt, SsurgeonPattern.PREDICATE_TAG);
    if (predElt != null) {
        result.setPredicate(assemblePredFromXML(getFirstChildElement(predElt)));
    }
    return result;
}
Also used : SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) Element(org.w3c.dom.Element)

Example 9 with SemgrexPattern

use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp.

In class UniversalEnglishGrammaticalStructure, the method processNames:

/**
 * Looks for NPs that should have the {@code name} relation and
 * a) changes the structure such that the leftmost token becomes the head
 * b) changes the relation from {@code compound} to {@code name}.
 *
 * Requires NER tags.
 *
 * @param sg A semantic graph.
 */
private static void processNames(SemanticGraph sg) {
    if (!USE_NAME) {
        return;
    }
    /* Semgrexes require a graph with a root. */
    if (sg.getRoots().isEmpty()) {
        return;
    }
    // Check whether NER tags are available; without them the name patterns
    // below cannot be applied meaningfully.
    IndexedWord rootToken = sg.getFirstRoot();
    if (rootToken == null || !rootToken.containsKey(CoreAnnotations.NamedEntityTagAnnotation.class)) {
        return;
    }
    // Match against a soft copy so that the edits made by processNamesHelper
    // on sg don't invalidate the graph the matcher is iterating over.
    SemanticGraph sgCopy = sg.makeSoftCopy();
    for (SemgrexPattern pattern : NAME_PATTERNS) {
        SemgrexMatcher matcher = pattern.matcher(sgCopy);
        // Accumulate the dependents (w2) of the current head (w1) across
        // consecutive matches; flush to processNamesHelper when the head changes.
        List<IndexedWord> nameParts = new ArrayList<>();
        IndexedWord head = null;
        while (matcher.find()) {
            IndexedWord w1 = matcher.getNode("w1");
            IndexedWord w2 = matcher.getNode("w2");
            if (head != w1) {
                if (head != null) {
                    // New head encountered: rewrite the previous head's name group.
                    processNamesHelper(sg, head, nameParts);
                    nameParts = new ArrayList<>();
                }
                head = w1;
            }
            // Only tokens sharing the head's NER tag are treated as name parts.
            if (w2.ner().equals(w1.ner())) {
                nameParts.add(w2);
            }
        }
        if (head != null) {
            // Flush the final accumulated group, then refresh the copy so the
            // next pattern sees the edits made by this one.
            processNamesHelper(sg, head, nameParts);
            sgCopy = sg.makeSoftCopy();
        }
    }
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Example 10 with SemgrexPattern

use of edu.stanford.nlp.semgraph.semgrex.SemgrexPattern in project CoreNLP by stanfordnlp.

In class CreateClauseDataset, the method subjectObjectPairs:

/**
 * Create a dataset of subject/object pairs, such that a sequence of splits that segments this
 * subject and object is a correct sequence.
 *
 * @param depparse The dependency parse of the sentence.
 * @param traceTargets The set of spans corresponding to targets of traces.
 * @param traceSources The set of indices in a sentence corresponding to the sources of traces.
 * @return A dataset of subject/object spans.
 */
@SuppressWarnings("UnusedParameters")
private static Collection<Pair<Span, Span>> subjectObjectPairs(SemanticGraph depparse, List<CoreLabel> tokens, Map<Integer, Span> traceTargets, Map<Integer, Integer> traceSources) {
    List<Pair<Span, Span>> data = new ArrayList<>();
    // Pass 1: VPs with no explicit subject, whose subject is recovered via traces.
    extractTracedVpPairs(depparse, traceTargets, traceSources, data);
    // Pass 2: vanilla subject/object pattern matches.
    extractVanillaVerbPairs(depparse, data);
    return data;
}

/**
 * Pass 1 of {@code subjectObjectPairs}: matches VP patterns that have a verb and
 * object but no attached subject, and recovers the subject span through the
 * trace machinery. Appends each recovered (subject, object) pair to {@code data}.
 */
private static void extractTracedVpPairs(SemanticGraph depparse, Map<Integer, Span> traceTargets, Map<Integer, Integer> traceSources, List<Pair<Span, Span>> data) {
    for (SemgrexPattern vpPattern : segmenter.VP_PATTERNS) {
        SemgrexMatcher matcher = vpPattern.matcher(depparse);
        while (matcher.find()) {
            IndexedWord verb = matcher.getNode("verb");
            IndexedWord object = matcher.getNode("object");
            if (verb == null || object == null) {
                continue;
            }
            // Skip VPs that already have an explicit subject attached.
            if (hasSubjectEdge(depparse, verb) || hasSubjectEdge(depparse, object)) {
                continue;
            }
            // Get the spans for the verb and object.
            Optional<List<IndexedWord>> verbChunk = segmenter.getValidChunk(depparse, verb, segmenter.VALID_ADVERB_ARCS, Optional.empty(), true);
            Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
            if (!verbChunk.isPresent() || !objectChunk.isPresent()) {
                continue;
            }
            verbChunk.get().sort(Comparator.comparingInt(IndexedWord::index));
            objectChunk.get().sort(Comparator.comparingInt(IndexedWord::index));
            // Find a trace whose source index lies within (or immediately
            // adjacent to) the verb span.
            int traceId = -1;
            Span verbSpan = toSpan(verbChunk.get());
            Span traceSpan = Span.fromValues(verbSpan.start() - 1, verbSpan.end() + 1);
            for (Map.Entry<Integer, Integer> entry : traceSources.entrySet()) {
                if (traceSpan.contains(entry.getValue())) {
                    traceId = entry.getKey();
                }
            }
            if (traceId < 0) {
                // No trace found for this VP: it is silently skipped
                // (formerly a commented-out debug log).
                continue;
            }
            // The trace target, if known, supplies the subject span.
            Span subjectSpan = traceTargets.get(traceId);
            if (subjectSpan != null) {
                data.add(Pair.makePair(subjectSpan, toSpan(objectChunk.get())));
            }
        }
    }
}

/**
 * Pass 2 of {@code subjectObjectPairs}: matches the plain verb patterns and
 * appends each (subject, object) span pair with valid chunks to {@code data}.
 */
private static void extractVanillaVerbPairs(SemanticGraph depparse, List<Pair<Span, Span>> data) {
    for (SemgrexPattern vpPattern : segmenter.VERB_PATTERNS) {
        SemgrexMatcher matcher = vpPattern.matcher(depparse);
        while (matcher.find()) {
            IndexedWord subject = matcher.getNode("subject");
            IndexedWord object = matcher.getNode("object");
            if (subject == null || object == null) {
                continue;
            }
            Optional<List<IndexedWord>> subjectChunk = segmenter.getValidChunk(depparse, subject, segmenter.VALID_SUBJECT_ARCS, Optional.empty(), true);
            Optional<List<IndexedWord>> objectChunk = segmenter.getValidChunk(depparse, object, segmenter.VALID_OBJECT_ARCS, Optional.empty(), true);
            if (subjectChunk.isPresent() && objectChunk.isPresent()) {
                data.add(Pair.makePair(toSpan(subjectChunk.get()), toSpan(objectChunk.get())));
            }
        }
    }
}

/** True if any outgoing dependency edge of {@code word} has a relation containing "subj". */
private static boolean hasSubjectEdge(SemanticGraph depparse, IndexedWord word) {
    for (SemanticGraphEdge edge : depparse.outgoingEdgeIterable(word)) {
        if (edge.getRelation().toString().contains("subj")) {
            return true;
        }
    }
    return false;
}
Also used : SemgrexMatcher(edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher) SemgrexPattern(edu.stanford.nlp.semgraph.semgrex.SemgrexPattern) Span(edu.stanford.nlp.ie.machinereading.structure.Span) SemanticGraphEdge(edu.stanford.nlp.semgraph.SemanticGraphEdge) IndexedWord(edu.stanford.nlp.ling.IndexedWord)

Aggregations

SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern)21 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)12 SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher)12 IndexedWord (edu.stanford.nlp.ling.IndexedWord)11 CoreLabel (edu.stanford.nlp.ling.CoreLabel)6 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)5 SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge)3 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)3 Span (edu.stanford.nlp.ie.machinereading.structure.Span)2 TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern)2 CandidatePhrase (edu.stanford.nlp.patterns.CandidatePhrase)2 DataInstance (edu.stanford.nlp.patterns.DataInstance)2 Pattern (edu.stanford.nlp.patterns.Pattern)2 PatternsAnnotations (edu.stanford.nlp.patterns.PatternsAnnotations)2 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)2 CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap)2 IntPair (edu.stanford.nlp.util.IntPair)2 Pair (edu.stanford.nlp.util.Pair)2 Triple (edu.stanford.nlp.util.Triple)2 ArrayList (java.util.ArrayList)2