Search in sources :

Example 11 with TregexMatcher

use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

the class RuleBasedCorefMentionFinder method extractEnumerations.

/**
 * Adds mentions for the conjuncts of enumerations found in the sentence's parse tree.
 * <p>
 * Every match of {@code enumerationsMentionPattern} contributes the subtrees bound to the
 * names {@code m1} and {@code m2}. Each subtree is keyed by the token span computed from
 * its first and last leaf. A span becomes a new {@link Mention} unless it is already in
 * {@code mentionSpanSet} or {@code insideNE} reports it as lying inside a named entity.
 *
 * @param s                  sentence annotation supplying tokens, parse tree and enhanced dependencies
 * @param mentions           list that receives the newly created mentions
 * @param mentionSpanSet     spans already used by mentions; each added span is recorded here
 * @param namedEntitySpanSet named-entity spans consulted through {@code insideNE}
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
    final List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
    final Tree parse = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    final SemanticGraph enhancedDeps = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

    // Pass 1: collect the subtree of every enumeration conjunct, keyed by its token span.
    final Map<IntPair, Tree> subtreeBySpan = Generics.newHashMap();
    final TregexMatcher enumMatcher = enumerationsMentionPattern.matcher(parse);
    while (enumMatcher.find()) {
        enumMatcher.getMatch();
        final Tree[] conjuncts = { enumMatcher.getNode("m1"), enumMatcher.getNode("m2") };
        for (Tree conjunct : conjuncts) {
            final List<Tree> leaves = conjunct.getLeaves();
            final CoreLabel firstLeaf = (CoreLabel) leaves.get(0).label();
            final CoreLabel lastLeaf = (CoreLabel) leaves.get(leaves.size() - 1).label();
            // The span is later used as subList bounds on the token list: the start is the first
            // leaf's index minus one, the end is the last leaf's index as-is.
            final int start = firstLeaf.get(CoreAnnotations.IndexAnnotation.class) - 1;
            final int end = lastLeaf.get(CoreAnnotations.IndexAnnotation.class);
            subtreeBySpan.put(new IntPair(start, end), conjunct);
        }
    }

    // Pass 2: turn every new span that is not inside a named entity into a mention.
    final int placeholderMentionId = -1;
    for (Map.Entry<IntPair, Tree> candidate : subtreeBySpan.entrySet()) {
        final IntPair span = candidate.getKey();
        if (mentionSpanSet.contains(span) || insideNE(span, namedEntitySpanSet)) {
            continue;
        }
        final List<CoreLabel> spanTokens = new ArrayList<>(tokens.subList(span.get(0), span.get(1)));
        mentions.add(new Mention(placeholderMentionId, span.get(0), span.get(1), enhancedDeps, spanTokens, candidate.getValue()));
        mentionSpanSet.add(span);
    }
}
Also used : TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) ParserConstraint(edu.stanford.nlp.parser.common.ParserConstraint) CoreLabel(edu.stanford.nlp.ling.CoreLabel) SemanticGraphCoreAnnotations(edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) SemanticGraph(edu.stanford.nlp.semgraph.SemanticGraph) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher)

Example 12 with TregexMatcher

use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

the class Mention method setNumber.

/**
 * Assigns this mention's grammatical {@code number}.
 * <p>
 * Pronominal mentions are decided from the pronoun dictionaries alone. For all other mentions
 * a first guess comes from the mention type, the NER label or the head word's POS tag. An
 * {@code UNKNOWN} guess is then refined from the singular/plural word lists. Finally an
 * enumeration pattern may force {@code PLURAL}.
 *
 * @param dict dictionaries providing the pronoun and word lists consulted here
 */
protected void setNumber(Dictionaries dict) {
    // Pronouns: dictionary lookup only; none of the later refinements apply to them.
    if (mentionType == MentionType.PRONOMINAL) {
        if (dict.pluralPronouns.contains(headString)) {
            number = Number.PLURAL;
        } else {
            number = dict.singularPronouns.contains(headString) ? Number.SINGULAR : Number.UNKNOWN;
        }
        return;
    }

    // First guess for non-pronominal mentions.
    if (mentionType == MentionType.LIST) {
        number = Number.PLURAL;
    } else if (nerString.equals("O") || mentionType == MentionType.NOMINAL) {
        // No usable entity label (or a nominal mention): fall back to the head word's POS tag.
        final String posTag = headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (!posTag.startsWith("N")) {
            number = Number.UNKNOWN;
        } else {
            number = posTag.endsWith("S") ? Number.PLURAL : Number.SINGULAR;
        }
    } else {
        // ORGs can be both plural and singular; every other entity label is treated as singular.
        final boolean organization = nerString.equals("ORGANIZATION") || nerString.startsWith("ORG");
        number = organization ? Number.UNKNOWN : Number.SINGULAR;
    }

    // Refinement 1: consult the word lists when the first guess was inconclusive.
    if (number == Number.UNKNOWN) {
        if (dict.singularWords.contains(headString)) {
            number = Number.SINGULAR;
            SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tSINGULAR:\t" + headString);
        } else if (dict.pluralWords.contains(headString)) {
            number = Number.PLURAL;
            SieveCoreferenceSystem.logger.finest("[Bergsma] Number set to:\tPLURAL:\t" + headString);
        }
    }

    // Refinement 2: a mention that is the named node of an enumeration and whose text
    // contains " and " is plural.
    // NOTE(review): "tmp" is bound to a child of the matched NP; confirm this identity test
    // can actually succeed when matching starts from mentionSubTree itself.
    final TregexMatcher enumMatcher = TregexPattern.compile("NP < (NP=tmp $.. (/,|CC/ $.. NP))").matcher(this.mentionSubTree);
    while (enumMatcher.find()) {
        if (this.mentionSubTree != enumMatcher.getNode("tmp")) {
            continue;
        }
        if (this.spanToString().toLowerCase().contains(" and ")) {
            number = Number.PLURAL;
        }
    }
}
Also used : TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher)

Example 13 with TregexMatcher

use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

the class SplitCanditoTrees method outputSplits.

/**
 * Writes the trees named by {@code ids} to the configured split files, one tree per line.
 * <p>
 * Trees are output in the order given in the {@code ids} list. Each file receives trees
 * until its configured size ({@code fSizes}) is reached, then output moves on to the next
 * file name ({@code fNames}). Ids missing from {@code treeMap} and degenerate
 * punctuation-only trees are skipped. Depending on the class's flags, leaves may be
 * rewritten by {@code mungeLeaves}, POS tags replaced, and the line format switched from
 * {@code Tree.toString()} to the Morfette format.
 * <p>
 * When the last configured file fills up, output stops cleanly and any remaining ids are
 * not written. The current writer is always closed, including when an exception propagates.
 *
 * @param ids     tree ids in the order the trees should be written
 * @param treeMap lookup from tree id to tree
 * @throws IOException if an output file cannot be opened
 */
public static void outputSplits(List<String> ids, Map<String, Tree> treeMap) throws IOException {
    Queue<Integer> fSizeQueue = new LinkedList<>(Arrays.asList(fSizes));
    Queue<String> fNameQueue = new LinkedList<>(Arrays.asList(fNames));
    TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
    TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
    final TreeTransformer tt = new FTBCorrector();
    int size = fSizeQueue.remove();
    String filename = fNameQueue.remove();
    log.info("Outputing " + filename);
    PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
    try {
        int outputCount = 0;
        for (String id : ids) {
            if (!treeMap.containsKey(id)) {
                log.info("Missing id: " + id);
                continue;
            }
            Tree tree = treeMap.get(id);
            TregexMatcher m = pBadTree.matcher(tree);
            TregexMatcher m2 = pBadTree2.matcher(tree);
            if (m.find() || m2.find()) {
                log.info("Discarding tree: " + tree.toString());
                continue;
            }
            // Punctuation normalization, etc.
            Tree backupCopy = tree.deepCopy();
            tree = tt.transformTree(tree);
            if (tree.firstChild().children().length == 0) {
                // Some trees have only punctuation. Tregex will mangle these. Don't throw those away.
                log.info("Saving tree: " + tree.toString());
                log.info("Backup: " + backupCopy.toString());
                tree = backupCopy;
            }
            if (LEMMAS_AS_LEAVES || ADD_MORPHO_TO_LEAVES) {
                mungeLeaves(tree, LEMMAS_AS_LEAVES, ADD_MORPHO_TO_LEAVES);
            }
            if (CC_TAGSET) {
                replacePOSTags(tree);
            }
            if (MORFETTE_OUTPUT) {
                writer.println(treeToMorfette(tree));
            } else {
                writer.println(tree.toString());
            }
            ++outputCount;
            if (outputCount == size) {
                // Flush and release the finished file before deciding what comes next.
                writer.close();
                if (fSizeQueue.isEmpty() || fNameQueue.isEmpty()) {
                    // The last configured split is full. Calling remove() on the empty queue
                    // would throw NoSuchElementException, so stop here instead.
                    log.info("No more output files configured; stopping after " + filename);
                    break;
                }
                outputCount = 0;
                size = fSizeQueue.remove();
                filename = fNameQueue.remove();
                log.info("Outputing " + filename);
                writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "UTF-8")));
            }
        }
    } finally {
        // Closing an already-closed PrintWriter has no effect, so this is safe on every path.
        writer.close();
    }
}
Also used : TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) LinkedList(java.util.LinkedList) BufferedWriter(java.io.BufferedWriter) FileOutputStream(java.io.FileOutputStream) Tree(edu.stanford.nlp.trees.Tree) OutputStreamWriter(java.io.OutputStreamWriter) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) FTBCorrector(edu.stanford.nlp.international.french.pipeline.FTBCorrector) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) PrintWriter(java.io.PrintWriter)

Example 14 with TregexMatcher

use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

the class MWEFrequencyDist method main.

/**
 * Prints frequency statistics for the multi-word expressions (nodes whose label matches
 * {@code /^MW/}) found in a tree file.
 * <p>
 * For every MWE label the output lists the total count, the number of terms seen exactly
 * once, and two percentages. A summary line, the token count and the number of distinct
 * POS sequences follow.
 *
 * @param args exactly one element: the path of the tree file to read (decoded as UTF-8)
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    // try-with-resources releases the file even when reading, matching or counting throws;
    // previously the reader was only closed on the success path.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        final TregexPattern pMWE = TregexPattern.compile("/^MW/");
        for (Tree t; (t = tr.readTree()) != null; ) {
            // Count MWE statistics
            TregexMatcher m = pMWE.matcher(t);
            while (m.findNextMatchingNode()) {
                Tree match = m.getMatch();
                String label = match.value();
                List<CoreLabel> yield = match.taggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                for (CoreLabel cl : yield) {
                    termYield.append(cl.word()).append(" ");
                    posYield.append(cl.tag()).append(" ");
                }
                mweLabelToString.incrementCount(label, termYield.toString().trim());
                uniquePOSSequences.add(posYield.toString().trim());
            }
        }
        // Closes the underlying reader
        tr.close();
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                if (mc.getCount(term) == 1.0) {
                    nSingletons++;
                }
                // Tokens = whitespace-separated words in the term, times the term's count.
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (IOException | TregexParseException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOExceptions, so this
        // covers everything the four separate handlers did.
        e.printStackTrace();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)

Example 15 with TregexMatcher

use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

the class FTBCorrector method main.

/**
 * Reads trees from a file, discards degenerate punctuation-only trees, and prints the
 * corrected form of every remaining tree to standard output, one per line.
 * The number of trees actually written is reported on standard error.
 *
 * @param args exactly one element: the path of the tree file to read (decoded as UTF-8)
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    try {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        // try-with-resources releases the file even if reading or transforming a tree throws.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"))) {
            TreeReaderFactory trf = new FrenchTreeReaderFactory();
            TreeReader tr = trf.newTreeReader(br);
            // Count only trees that are printed; discarded trees must not inflate "Wrote N trees".
            int nWritten = 0;
            for (Tree t; (t = tr.readTree()) != null; ) {
                TregexMatcher m = pBadTree.matcher(t);
                TregexMatcher m2 = pBadTree2.matcher(t);
                if (m.find() || m2.find()) {
                    log.info("Discarding tree: " + t.toString());
                } else {
                    Tree fixedT = tt.transformTree(t);
                    System.out.println(fixedT.toString());
                    nWritten++;
                }
            }
            tr.close();
            System.err.printf("Wrote %d trees%n", nWritten);
        }
    } catch (IOException | TregexParseException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOExceptions, so this
        // covers everything the four separate handlers did.
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Aggregations

TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher): 24; TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern): 16; Tree (edu.stanford.nlp.trees.Tree): 10; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 9; CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 6; SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 5; SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 5; Mention (edu.stanford.nlp.coref.data.Mention): 3; ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 3; TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 3; TregexParseException (edu.stanford.nlp.trees.tregex.TregexParseException): 3; IntPair (edu.stanford.nlp.util.IntPair): 3; Label (edu.stanford.nlp.ling.Label): 2; SerializableFunction (edu.stanford.nlp.process.SerializableFunction): 2; TreeReader (edu.stanford.nlp.trees.TreeReader): 2; TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory): 2; TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 2; FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory): 2; TsurgeonPattern (edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern): 2; Pair (edu.stanford.nlp.util.Pair): 2