Search in sources :

Example 1 with TregexParseException

use of edu.stanford.nlp.trees.tregex.TregexParseException in project CoreNLP by stanfordnlp.

the class TregexPoweredTreebankParserParams method compileAnnotations.

/**
 * Compile the {@link #annotations} collection given a
 * particular head finder. Subclasses should call this method at
 * least once before the class is used, and whenever the head finder
 * is changed.
 *
 * <p>Any previously compiled patterns are discarded first. An annotation
 * whose pattern fails to compile is logged and skipped, so it has no entry
 * in {@code annotationPatterns} afterwards.
 *
 * @param hf head finder passed to the {@link TregexPatternCompiler} that
 *           compiles every annotation pattern
 */
protected void compileAnnotations(HeadFinder hf) {
    TregexPatternCompiler compiler = new TregexPatternCompiler(hf);
    annotationPatterns.clear();
    // 1-based position of the current annotation in iteration order. Tracked
    // separately because annotationPatterns only grows on success, so its size
    // would under-count once any earlier pattern has been skipped.
    int nth = 0;
    for (Map.Entry<String, Pair<String, Function<TregexMatcher, String>>> annotation : annotations.entrySet()) {
        nth++;
        Pair<String, Function<TregexMatcher, String>> spec = annotation.getValue();
        TregexPattern compiled;
        try {
            compiled = compiler.compile(spec.first());
        } catch (TregexParseException e) {
            log.info("Parse exception on annotation pattern #" + nth + " initialization: " + e);
            continue;
        }
        Pair<TregexPattern, Function<TregexMatcher, String>> behavior = new Pair<>(compiled, spec.second());
        annotationPatterns.put(annotation.getKey(), behavior);
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) Function(java.util.function.Function) SerializableFunction(edu.stanford.nlp.process.SerializableFunction) TregexPatternCompiler(edu.stanford.nlp.trees.tregex.TregexPatternCompiler) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair)

Example 2 with TregexParseException

use of edu.stanford.nlp.trees.tregex.TregexParseException in project CoreNLP by stanfordnlp.

the class FTBCorrector method main.

/**
 * Reads the tree file named on the command line (decoded as UTF-8), discards
 * trees matching two known-bad patterns, passes every other tree through an
 * {@link FTBCorrector} and prints the transformed tree to standard output.
 * The number of trees actually written is reported on standard error.
 *
 * <p>I/O failures and Tregex compilation failures are reported via
 * {@code printStackTrace()}; they are not rethrown.
 *
 * @param args exactly one element: the path of the tree file to read.
 *             Any other argument count logs a usage line and exits with -1.
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    try {
        // These bad trees in the Candito training set should be thrown out:
        // (ROOT (SENT (" ") (. .)))
        // (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        // Counts only trees that were transformed and printed, so that the
        // "Wrote" summary below does not include discarded trees.
        int nWritten = 0;
        // try-with-resources closes the readers even when reading or
        // transforming a tree throws part-way through the file.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
             TreeReader tr = trf.newTreeReader(br)) {
            for (Tree t; (t = tr.readTree()) != null; ) {
                TregexMatcher m = pBadTree.matcher(t);
                TregexMatcher m2 = pBadTree2.matcher(t);
                if (m.find() || m2.find()) {
                    log.info("Discarding tree: " + t.toString());
                } else {
                    Tree fixedT = tt.transformTree(t);
                    System.out.println(fixedT.toString());
                    nWritten++;
                }
            }
        }
        System.err.printf("Wrote %d trees%n", nWritten);
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and FileNotFoundException,
        // which are IOException subclasses.
        e.printStackTrace();
    } catch (TregexParseException e) {
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Example 3 with TregexParseException

use of edu.stanford.nlp.trees.tregex.TregexParseException in project CoreNLP by stanfordnlp.

the class FTBDataset method build.

/**
 * Loads every path in {@code pathsToData} into the treebank, then writes one
 * line per retained tree to {@code outFileName} and, when {@code makeFlatFile}
 * is set, the flattened form of the same tree to {@code flatFileName}.
 *
 * <p>A tree is dropped when it matches one of three punctuation-only
 * patterns, or when {@code splitSet} is non-null and does not contain the
 * tree's Candito ID. Failures to open the output or to compile the patterns
 * are reported on standard error; both writers are closed in all cases.
 */
@Override
public void build() {
    for (File dataPath : pathsToData) {
        treebank.loadPath(dataPath, treeFileExtension, false);
    }

    PrintWriter treeWriter = null;
    PrintWriter flatWriter = null;
    try {
        treeWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
        if (makeFlatFile) {
            flatWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName), "UTF-8")));
        }

        outputFileList.add(outFileName);
        if (makeFlatFile) {
            outputFileList.add(flatFileName);
            toStringBuilder.append(" Made flat files\n");
        }

        preprocessMWEs();

        // Patterns for trees to discard up front.
        List<TregexPattern> rejectPatterns = new ArrayList<>();
        // These two appear in the Candito training set and are mangled by the
        // TreeCorrector.
        rejectPatterns.add(TregexPattern.compile("@SENT <: @PUNC"));
        rejectPatterns.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __"));
        // wsg2011: tree #552 of the Candito test set. It was kept for the
        // EMNLP2011 paper, but it is all punctuation and would not be evaluated;
        // since no split is done for this data set, it is simply removed.
        rejectPatterns.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC <3 @PUNC <4 @PUNC !<5 __"));

        for (Tree tree : treebank) {
            // Stop probing patterns as soon as one of them matches.
            boolean rejected = false;
            for (int i = 0; i < rejectPatterns.size() && !rejected; i++) {
                rejected = rejectPatterns.get(i).matcher(tree).find();
            }
            if (rejected) {
                log.info("Discarding tree: " + tree.toString());
                continue;
            }

            // Skip trees that do not belong to this part of the split.
            if (splitSet != null && !splitSet.contains(getCanditoTreeID(tree))) {
                continue;
            }

            if (customTreeVisitor != null) {
                customTreeVisitor.visitTree(tree);
            }

            treeWriter.println(tree.toString());

            if (makeFlatFile) {
                String flatString = ATBTreeUtils.flattenTree(tree);
                if (removeEscapeTokens) {
                    flatString = ATBTreeUtils.unEscape(flatString);
                }
                flatWriter.println(flatString);
            }
        }
    } catch (UnsupportedEncodingException e) {
        System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName());
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        System.err.printf("%s: Could not open %s for writing%n", this.getClass().getName(), outFileName);
    } catch (TregexParseException e) {
        System.err.printf("%s: Could not compile Tregex expressions%n", this.getClass().getName());
        e.printStackTrace();
    } finally {
        if (treeWriter != null) {
            treeWriter.close();
        }
        if (flatWriter != null) {
            flatWriter.close();
        }
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) BufferedWriter(java.io.BufferedWriter) FileOutputStream(java.io.FileOutputStream) Tree(edu.stanford.nlp.trees.Tree) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 4 with TregexParseException

use of edu.stanford.nlp.trees.tregex.TregexParseException in project CoreNLP by stanfordnlp.

the class MWEFrequencyDist method main.

/**
 * Reads the tree file named on the command line (decoded as UTF-8), collects
 * every node whose label matches {@code /^MW/}, and prints per-label
 * frequency statistics to standard output: a tab-separated table with one
 * row per label, a TOTAL row, the token count and the number of distinct
 * POS sequences.
 *
 * <p>I/O failures and Tregex compilation failures are reported via
 * {@code printStackTrace()}; they are not rethrown.
 *
 * @param args exactly one element: the path of the tree file to read.
 *             Any other argument count prints a usage line and exits with -1.
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    try {
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        // try-with-resources closes the readers even when reading a tree
        // throws part-way through the file.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
             TreeReader tr = trf.newTreeReader(br)) {
            final TregexPattern pMWE = TregexPattern.compile("/^MW/");
            for (Tree t; (t = tr.readTree()) != null; ) {
                // Count MWE statistics
                TregexMatcher m = pMWE.matcher(t);
                while (m.findNextMatchingNode()) {
                    Tree match = m.getMatch();
                    String label = match.value();
                    List<CoreLabel> yield = match.taggedLabeledYield();
                    StringBuilder termYield = new StringBuilder();
                    StringBuilder posYield = new StringBuilder();
                    for (CoreLabel cl : yield) {
                        termYield.append(cl.word()).append(" ");
                        posYield.append(cl.tag()).append(" ");
                    }
                    mweLabelToString.incrementCount(label, termYield.toString().trim());
                    uniquePOSSequences.add(posYield.toString().trim());
                }
            }
        }
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                // A term seen exactly once under this label is a singleton.
                if (mc.getCount(term) == 1.0) {
                    nSingletons++;
                }
                // Tokens contributed by this term: words in the term times its count.
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (TregexParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and FileNotFoundException,
        // which are IOException subclasses.
        e.printStackTrace();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)

Aggregations

TregexParseException (edu.stanford.nlp.trees.tregex.TregexParseException)4 TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern)4 Tree (edu.stanford.nlp.trees.Tree)3 TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher)3 TreeReader (edu.stanford.nlp.trees.TreeReader)2 TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)2 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)2 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)2 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 CoreLabel (edu.stanford.nlp.ling.CoreLabel)1 SerializableFunction (edu.stanford.nlp.process.SerializableFunction)1 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)1 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)1 TregexPatternCompiler (edu.stanford.nlp.trees.tregex.TregexPatternCompiler)1 Pair (edu.stanford.nlp.util.Pair)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1