Search in sources :

Example 16 with TregexPattern

use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.

The method main of the class FTBCorrector.

/**
 * Reads the treebank file named by the single command-line argument, discards
 * the known-bad punctuation-only trees, passes every remaining tree through an
 * {@link FTBCorrector}, and prints each corrected tree to standard output.
 * A summary with the number of trees actually written goes to standard error.
 *
 * @param args exactly one element: the path of the treebank file to read
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    try {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        // try-with-resources: the underlying file is released even when reading
        // or transforming a tree throws part-way through the file.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"))) {
            TreeReaderFactory trf = new FrenchTreeReaderFactory();
            TreeReader tr = trf.newTreeReader(br);
            // Counts only the trees that are printed, so the summary below is accurate.
            int nWritten = 0;
            for (Tree t; (t = tr.readTree()) != null; ) {
                if (pBadTree.matcher(t).find() || pBadTree2.matcher(t).find()) {
                    log.info("Discarding tree: " + t.toString());
                    continue;
                }
                Tree fixedT = tt.transformTree(t);
                System.out.println(fixedT.toString());
                nWritten++;
            }
            tr.close();
            System.err.printf("Wrote %d trees%n", nWritten);
        }
    } catch (IOException | TregexParseException e) {
        // UnsupportedEncodingException and FileNotFoundException are IOExceptions,
        // and all of them were handled identically, so one handler covers them.
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Example 17 with TregexPattern

use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.

The method build of the class FTBDataset.

/**
 * Loads every path in {@code pathsToData} into the treebank, filters the trees,
 * and writes the survivors to {@code outFileName}, one tree per line. When
 * {@code makeFlatFile} is set, a flattened rendering of each written tree is
 * also written to {@code flatFileName}.
 *
 * <p>A tree is dropped when it matches one of the punctuation-only "bad tree"
 * patterns, or when {@code splitSet} is non-null and does not contain the
 * tree's Candito ID. Failures to open the output files or to compile the
 * patterns are reported on standard error rather than thrown.
 */
@Override
public void build() {
    for (File path : pathsToData) {
        treebank.loadPath(path, treeFileExtension, false);
    }
    // try-with-resources closes whichever writers were opened; a null resource
    // (no flat file requested) is permitted and simply skipped on close.
    try (PrintWriter outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
         PrintWriter flatFile = (makeFlatFile) ? new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(flatFileName), "UTF-8"))) : null) {
        outputFileList.add(outFileName);
        if (makeFlatFile) {
            outputFileList.add(flatFileName);
            toStringBuffer.append(" Made flat files\n");
        }
        preprocessMWEs();
        List<TregexPattern> badTrees = new ArrayList<>();
        //These trees appear in the Candito training set
        //They are mangled by the TreeCorrector, so discard them ahead of time.
        badTrees.add(TregexPattern.compile("@SENT <: @PUNC"));
        badTrees.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __"));
        //wsg2011: This filters out tree #552 in the Candito test set. We saved this tree for the
        //EMNLP2011 paper, but since it consists entirely of punctuation, it won't be evaluated anyway.
        //Since we aren't doing the split in this data set, just remove the tree.
        badTrees.add(TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC <3 @PUNC <4 @PUNC !<5 __"));
        for (Tree t : treebank) {
            //Filter out bad trees
            boolean skipTree = false;
            for (TregexPattern p : badTrees) {
                if (p.matcher(t).find()) {
                    skipTree = true;
                    break;
                }
            }
            if (skipTree) {
                log.info("Discarding tree: " + t.toString());
                continue;
            }
            // Filter out trees that aren't in this part of the split
            if (splitSet != null) {
                String canditoTreeID = getCanditoTreeID(t);
                if (!splitSet.contains(canditoTreeID)) {
                    continue;
                }
            }
            if (customTreeVisitor != null) {
                customTreeVisitor.visitTree(t);
            }
            outfile.println(t.toString());
            if (makeFlatFile) {
                String flatString = (removeEscapeTokens) ? ATBTreeUtils.unEscape(ATBTreeUtils.flattenTree(t)) : ATBTreeUtils.flattenTree(t);
                flatFile.println(flatString);
            }
        }
    } catch (UnsupportedEncodingException e) {
        System.err.printf("%s: Filesystem does not support UTF-8 output%n", this.getClass().getName());
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        // Either output file may have failed to open; the exception message
        // names the one that did, instead of always blaming outFileName.
        System.err.printf("%s: Could not open output file for writing: %s%n", this.getClass().getName(), e.getMessage());
    } catch (TregexParseException e) {
        System.err.printf("%s: Could not compile Tregex expressions%n", this.getClass().getName());
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) BufferedWriter(java.io.BufferedWriter) FileOutputStream(java.io.FileOutputStream) Tree(edu.stanford.nlp.trees.Tree) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 18 with TregexPattern

use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.

The method loadOps of the class ATBCorrector.

/**
 * Parses the edit script held in {@code editStr} into a list of
 * (match pattern, combined operation) pairs.
 *
 * <p>The script is read line by line: one line is compiled as a Tregex
 * pattern, then the following lines are parsed as Tsurgeon operations for as
 * long as {@code continuing(...)} accepts them. The line that ends a run of
 * operations is consumed and not interpreted. A pattern followed by no
 * operations contributes nothing to the result.
 *
 * @return the parsed pairs, in script order; possibly empty
 */
private List<Pair<TregexPattern, TsurgeonPattern>> loadOps() {
    final List<Pair<TregexPattern, TsurgeonPattern>> result = new ArrayList<>();
    // Scratch list, emptied and refilled for each pattern in the script.
    final List<TsurgeonPattern> pending = new ArrayList<>();
    try {
        final BufferedReader reader = new BufferedReader(new StringReader(editStr));
        for (String patternText = reader.readLine(); patternText != null; patternText = reader.readLine()) {
            if (DEBUG) {
                log.info("Pattern is " + patternText);
            }
            final TregexPattern matcher = TregexPattern.compile(patternText);
            if (DEBUG) {
                log.info(" [" + matcher + "]");
            }

            pending.clear();
            String opText;
            while (continuing(opText = reader.readLine())) {
                final TsurgeonPattern operation = Tsurgeon.parseOperation(opText);
                if (DEBUG) {
                    log.info("Operation is " + opText + " [" + operation + "]");
                }
                pending.add(operation);
            }

            if (pending.isEmpty()) {
                continue;
            }
            result.add(new Pair<>(matcher, Tsurgeon.collectOperations(pending)));
        }
    } catch (IOException readFailure) {
        readFailure.printStackTrace();
    }
    return result;
}
Also used : TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TsurgeonPattern(edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern) Pair(edu.stanford.nlp.util.Pair)

Example 19 with TregexPattern

use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.

The method main of the class RHSFrequency.

/**
 * Counts the right-hand sides (sequences of child labels) of every node whose
 * category matches a given root label, over the trees loaded from a path, and
 * prints them by descending count as {@code rhs<TAB>count} lines.
 *
 * <p>Arguments are processed left to right. {@code -l <language>} selects the
 * treebank parameters and {@code -e <encoding>} the character encoding; any
 * other option prints the usage text and exits. A non-option argument is taken
 * as the root category and must be followed by the treebank path to load.
 *
 * @param args command-line options followed by a category and a treebank path
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.length; i++) {
        // Both an option and a category consume the argument that follows them,
        // so a trailing one without a successor is a usage error rather than an
        // ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
            System.out.println(usage.toString());
            System.exit(-1);
        }
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            rootMatch = TregexPattern.compile("@" + args[i]);
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            // Pre-increment only: the loop header performs the final advance.
            // The previous double post-increment skipped the argument after the path.
            tb.loadPath(args[++i]);
        }
    }
    // Options alone are not enough: without a category and a path there is nothing to count.
    if (tb == null || rootMatch == null) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    Counter<String> rhsCounter = new ClassicCounter<>();
    for (Tree t : tb) {
        TregexMatcher m = rootMatch.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            StringBuilder sb = new StringBuilder();
            for (Tree kid : match.children()) {
                sb.append(kid.value()).append(" ");
            }
            rhsCounter.incrementCount(sb.toString().trim());
        }
    }
    List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
    PrintWriter pw = tlpp.pw();
    for (String rhs : biggestKeys) {
        pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    }
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) PrintWriter(java.io.PrintWriter)

Example 20 with TregexPattern

use of edu.stanford.nlp.trees.tregex.TregexPattern in project CoreNLP by stanfordnlp.

The method findTreePattern of the class MentionExtractor.

/**
 * Compiles {@code tregex} and hands the compiled pattern, together with
 * {@code tree} and {@code foundPairs}, to the overload that takes a
 * {@link TregexPattern}. Any exception raised along the way is rethrown
 * wrapped in a {@link RuntimeException}.
 */
private void findTreePattern(Tree tree, String tregex, Set<Pair<Integer, Integer>> foundPairs) {
    try {
        findTreePattern(tree, TregexPattern.compile(tregex), foundPairs);
    } catch (Exception unexpected) {
        // Not anticipated in practice; propagate unchecked with the cause attached.
        throw new RuntimeException(unexpected);
    }
}
Also used : TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern)

Aggregations

TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern)29 TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher)16 Tree (edu.stanford.nlp.trees.Tree)8 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)6 CoreLabel (edu.stanford.nlp.ling.CoreLabel)6 ParserConstraint (edu.stanford.nlp.parser.common.ParserConstraint)6 Pair (edu.stanford.nlp.util.Pair)6 SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph)5 SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations)5 ArrayList (java.util.ArrayList)5 TregexParseException (edu.stanford.nlp.trees.tregex.TregexParseException)4 TsurgeonPattern (edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern)4 Mention (edu.stanford.nlp.coref.data.Mention)3 TreeCoreAnnotations (edu.stanford.nlp.trees.TreeCoreAnnotations)3 IntPair (edu.stanford.nlp.util.IntPair)3 IOException (java.io.IOException)3 PrintWriter (java.io.PrintWriter)3 SerializableFunction (edu.stanford.nlp.process.SerializableFunction)2 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)2 TreeReader (edu.stanford.nlp.trees.TreeReader)2