Search in sources :

Example 21 with TregexMatcher

Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

The class MWEFrequencyDist, method main.

/**
 * Prints frequency statistics for the nodes whose label matches {@code /^MW/}
 * in a tree file read with {@link FrenchTreeReaderFactory}.
 *
 * <p>For every matching label, one tab-separated row is written to standard
 * output: the label, its total count, the number of yields seen exactly once
 * ("singletons"), the singleton percentage, and the label's share of all
 * matches. A TOTAL row, the token count and the number of distinct POS
 * sequences follow.
 *
 * @param args exactly one element: the path of the UTF-8 encoded tree file
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources guarantees the file is released even when reading or
    // matching fails part-way through; closing the TreeReader also closes br.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        TreeReader tr = trf.newTreeReader(br)) {
        final TregexPattern pMWE = TregexPattern.compile("/^MW/");
        for (Tree t; (t = tr.readTree()) != null; ) {
            //Count MWE statistics
            TregexMatcher m = pMWE.matcher(t);
            while (m.findNextMatchingNode()) {
                Tree match = m.getMatch();
                String label = match.value();
                List<CoreLabel> yield = match.taggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                for (CoreLabel cl : yield) {
                    termYield.append(cl.word()).append(" ");
                    posYield.append(cl.tag()).append(" ");
                }
                mweLabelToString.incrementCount(label, termYield.toString().trim());
                uniquePOSSequences.add(posYield.toString().trim());
            }
        }
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                if (mc.getCount(term) == 1.0) {
                    nSingletons++;
                }
                // Tokens contributed = words in the yield times its occurrence count.
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (IOException | TregexParseException e) {
        // IOException also covers UnsupportedEncodingException and FileNotFoundException.
        e.printStackTrace();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)

Example 22 with TregexMatcher

Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

The class FTBCorrector, method main.

/**
 * Reads every tree from a UTF-8 encoded file, discards the trees matched by
 * the two "bad tree" patterns below, applies an {@code FTBCorrector}
 * transformation to the remaining trees and prints each result to standard
 * output.
 *
 * @param args exactly one element: the path of the tree file to correct
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources releases the file even when reading or transforming
    // fails; closing the TreeReader also closes the wrapped reader.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        TreeReader tr = trf.newTreeReader(br)) {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        int nTrees = 0;
        for (Tree t; (t = tr.readTree()) != null; ) {
            TregexMatcher m = pBadTree.matcher(t);
            TregexMatcher m2 = pBadTree2.matcher(t);
            if (m.find() || m2.find()) {
                log.info("Discarding tree: " + t.toString());
            } else {
                Tree fixedT = tt.transformTree(t);
                System.out.println(fixedT.toString());
                // Count only trees that were actually written, so the summary
                // below does not include the discarded ones.
                nTrees++;
            }
        }
        System.err.printf("Wrote %d trees%n", nTrees);
    } catch (IOException | TregexParseException e) {
        // IOException also covers UnsupportedEncodingException and FileNotFoundException.
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Example 23 with TregexMatcher

Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

The class MWEPreprocessor, method countMWEStatistics.

/**
 * Updates the unigram tagger from {@code t} and then, for every node of
 * {@code t} matched by {@code pMWE}, records the node's label against its
 * pre-terminal yield and its terminal yield, in both directions.
 *
 * <p>When {@code RESOLVE_DUMMY_TAGS} is set, nodes labelled
 * {@code FrenchXMLTreeReader.MISSING_PHRASAL} are not counted.
 *
 * @param t the tree to collect statistics from
 * @param unigramTagger counter handed to {@code updateTagger} together with {@code t}
 * @param labelPreterm incremented at (label, pre-terminal yield)
 * @param pretermLabel incremented at (pre-terminal yield, label)
 * @param labelTerm incremented at (label, terminal yield)
 * @param termLabel incremented at (terminal yield, label)
 */
public static void countMWEStatistics(Tree t, TwoDimensionalCounter<String, String> unigramTagger, TwoDimensionalCounter<String, String> labelPreterm, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> labelTerm, TwoDimensionalCounter<String, String> termLabel) {
    updateTagger(unigramTagger, t);

    // Walk over every matching node in turn.
    for (TregexMatcher matcher = pMWE.matcher(t); matcher.findNextMatchingNode(); ) {
        Tree node = matcher.getMatch();
        String nodeLabel = node.value();

        boolean isUnresolvedDummy = RESOLVE_DUMMY_TAGS && nodeLabel.equals(FrenchXMLTreeReader.MISSING_PHRASAL);
        if (!isUnresolvedDummy) {
            String pretermYield = SentenceUtils.listToString(node.preTerminalYield());
            String termYield = SentenceUtils.listToString(node.yield());

            labelPreterm.incrementCount(nodeLabel, pretermYield);
            pretermLabel.incrementCount(pretermYield, nodeLabel);
            labelTerm.incrementCount(nodeLabel, termYield);
            termLabel.incrementCount(termYield, nodeLabel);
        }
    }
}
Also used : Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher)

Example 24 with TregexMatcher

Use of edu.stanford.nlp.trees.tregex.TregexMatcher in project CoreNLP by stanfordnlp.

The class RHSFrequency, method main.

/**
 * Counts the child-label sequences (right-hand sides) of every treebank node
 * matching a given category and prints them, most frequent first, as
 * tab-separated "rhs, count" lines.
 *
 * <p>Recognised arguments: {@code -l <language>} selects the treebank
 * parameters, {@code -e <encoding>} sets the input/output encoding, and a
 * positional pair {@code <category> <path>} names the category to match
 * (compiled as {@code "@" + category}) and a path to load into the treebank.
 * Malformed command lines print the usage message and exit with status -1.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        printUsageAndExit();
        return;
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
    // Every option and every positional entry consumes exactly two arguments,
    // so the index is advanced explicitly rather than by a for-loop update
    // (which previously skipped the argument following a category/path pair).
    int i = 0;
    while (i < args.length) {
        String arg = args[i];
        if (i + 1 >= args.length) {
            // Missing option value or missing treebank path.
            printUsageAndExit();
            return;
        }
        String value = args[i + 1];
        if (arg.startsWith("-")) {
            switch(arg) {
                case "-l":
                    Language lang = Language.valueOf(value.trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = value;
                    break;
                default:
                    printUsageAndExit();
                    return;
            }
        } else {
            rootMatch = TregexPattern.compile("@" + arg);
            if (tb == null) {
                if (tlpp == null) {
                    printUsageAndExit();
                    return;
                }
                tlpp.setInputEncoding(encoding);
                tlpp.setOutputEncoding(encoding);
                tb = tlpp.diskTreebank();
            }
            tb.loadPath(value);
        }
        i += 2;
    }
    if (tb == null || rootMatch == null) {
        // Only options were supplied: there is nothing to search.
        printUsageAndExit();
        return;
    }
    Counter<String> rhsCounter = new ClassicCounter<>();
    for (Tree t : tb) {
        TregexMatcher m = rootMatch.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            StringBuilder sb = new StringBuilder();
            for (Tree kid : match.children()) {
                sb.append(kid.value()).append(" ");
            }
            rhsCounter.incrementCount(sb.toString().trim());
        }
    }
    List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
    // try-with-resources closes the writer even if printing fails.
    try (PrintWriter pw = tlpp.pw()) {
        for (String rhs : biggestKeys) {
            pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
        }
    }
}

/** Prints the usage message to standard output and terminates with status -1. */
private static void printUsageAndExit() {
    System.out.println(usage.toString());
    System.exit(-1);
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) PrintWriter(java.io.PrintWriter)

Aggregations

TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher): 24; TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern): 16; Tree (edu.stanford.nlp.trees.Tree): 10; CoreLabel (edu.stanford.nlp.ling.CoreLabel): 9; CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 6; SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 5; SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 5; Mention (edu.stanford.nlp.coref.data.Mention): 3; ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint): 3; TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations): 3; TregexParseException (edu.stanford.nlp.trees.tregex.TregexParseException): 3; IntPair (edu.stanford.nlp.util.IntPair): 3; Label (edu.stanford.nlp.ling.Label): 2; SerializableFunction (edu.stanford.nlp.process.SerializableFunction): 2; TreeReader (edu.stanford.nlp.trees.TreeReader): 2; TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory): 2; TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 2; FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory): 2; TsurgeonPattern (edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern): 2; Pair (edu.stanford.nlp.util.Pair): 2