Search in sources :

Example 16 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

the class MungeTreesWithMorfetteAnalyses method main.

/**
 * Rewrites the leaves of every tree in a tree file using the matching analysis
 * from a Morfette file, and prints each rewritten tree to standard output.
 *
 * <p>Each leaf value becomes the concatenation of the original value,
 * {@code MorphoFeatureSpecification.MORPHO_MARK}, the lemma returned by
 * {@code getLemma}, {@code MorphoFeatureSpecification.LEMMA_MARK} and the tag
 * of the analysis. Trees and analyses are paired in file order; a warning is
 * logged when one input runs out before the other.
 *
 * @param args {@code args[0]} is the tree file, {@code args[1]} is the Morfette file
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    String morfetteFile = args[1];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources guarantees the reader is closed even when a read fails mid-file.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
        Iterator<List<CoreLabel>> morfetteItr = new MorfetteFileIterator(morfetteFile);
        // Set when a tree was read but no analysis was left for it. Re-reading the
        // tree file afterwards cannot detect this, because that tree is already consumed.
        boolean treeWithoutAnalysis = false;
        for (Tree tree; (tree = tr.readTree()) != null; ) {
            if (!morfetteItr.hasNext()) {
                treeWithoutAnalysis = true;
                break;
            }
            List<CoreLabel> analysis = morfetteItr.next();
            List<Label> yield = tree.yield();
            assert analysis.size() == yield.size()
                : "Analysis has " + analysis.size() + " tokens but tree yield has " + yield.size();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel tokenAnalysis = analysis.get(i);
                Label token = yield.get(i);
                String lemma = getLemma(token.value(), tokenAnalysis.lemma());
                String newLeaf = String.format("%s%s%s%s%s", token.value(), MorphoFeatureSpecification.MORPHO_MARK, lemma, MorphoFeatureSpecification.LEMMA_MARK, tokenAnalysis.tag());
                // The label is mutated in place, so the tree printed below carries the new leaf.
                ((CoreLabel) token).setValue(newLeaf);
            }
            System.out.println(tree.toString());
        }
        // Either the trees ran out first (analyses remain) or the analyses ran out first.
        if (treeWithoutAnalysis || morfetteItr.hasNext()) {
            log.info("WARNING: Uneven input files!");
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) ArrayList(java.util.ArrayList) List(java.util.List) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 17 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

the class TreeToMorfette method main.

/**
 * Prints one {@code word lemma morph} line per leaf of every tree in a tree
 * file, with an empty line after each tree.
 *
 * <p>The lemma and morphological string come from
 * {@code MorphoFeatureSpecification.splitMorphString}, applied to the leaf's
 * value and original text. When the morphological string is {@code null},
 * empty or {@code "XXX"}, the value of the matching preterminal label is
 * printed instead.
 *
 * @param args {@code args[0]} is the tree file to convert
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s tree_file%n", TreeToMorfette.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources guarantees the reader is closed even when a read fails mid-file.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")))) {
        for (Tree tree1; (tree1 = tr.readTree()) != null; ) {
            List<Label> pretermYield = tree1.preTerminalYield();
            List<Label> yield = tree1.yield();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel rawToken = (CoreLabel) yield.get(i);
                String word = rawToken.value();
                String morphStr = rawToken.originalText();
                Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr);
                String lemma = lemmaMorph.first();
                String morph = lemmaMorph.second();
                // Fall back to the preterminal label when no usable morphology is present.
                if (morph == null || morph.equals("") || morph.equals("XXX")) {
                    morph = ((CoreLabel) pretermYield.get(i)).value();
                }
                System.out.printf("%s %s %s%n", word, lemma, morph);
            }
            // Blank line separates the tokens of consecutive trees.
            System.out.println();
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 18 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

the class FTBCorrector method main.

/**
 * Reads every tree from a file, drops trees matching either of two
 * "punctuation-only sentence" Tregex patterns, applies an {@code FTBCorrector}
 * transform to the rest and prints the transformed trees to standard output.
 *
 * <p>Discarded trees are reported through the logger. The number of trees
 * actually written is reported on standard error at the end.
 *
 * @param args {@code args[0]} is the tree file to correct
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    try {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        // try-with-resources guarantees the reader is closed even when a read fails mid-file.
        try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8")))) {
            // Counts only trees that are printed, so the final message is accurate
            // when some trees are discarded.
            int nWritten = 0;
            for (Tree t; (t = tr.readTree()) != null; ) {
                TregexMatcher m = pBadTree.matcher(t);
                TregexMatcher m2 = pBadTree2.matcher(t);
                if (m.find() || m2.find()) {
                    log.info("Discarding tree: " + t.toString());
                } else {
                    Tree fixedT = tt.transformTree(t);
                    System.out.println(fixedT.toString());
                    nWritten++;
                }
            }
            System.err.printf("Wrote %d trees%n", nWritten);
        }
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (TregexParseException e) {
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Example 19 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

the class MWEPreprocessor method resolveDummyTags.

/**
 * Reads every tree from {@code treeFile}, passes it through
 * {@code traverseAndFix} and writes the result, one tree per line, to a
 * sibling file named {@code treeFile + ".fixed"}. The number of processed
 * trees is printed to standard output.
 *
 * <p>I/O failures are printed as stack traces and not propagated.
 *
 * @param treeFile      UTF-8 tree file to read
 * @param pretermLabel  counter forwarded unchanged to {@code traverseAndFix}
 * @param unigramTagger counter forwarded unchanged to {@code traverseAndFix}
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // Both resources are closed automatically (writer first, then reader), even on failure.
    // PrintWriter(File, charset) is used so characters are really encoded as UTF-8:
    // wrapping a UTF-8 PrintStream in a PrintWriter encodes with the platform
    // default charset, because the PrintStream only passes the bytes through.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
         PrintWriter pw = new PrintWriter(new File(treeFile + ".fixed"), "UTF-8")) {
        int nTrees = 0;
        for (Tree t; (t = tr.readTree()) != null; nTrees++) {
            traverseAndFix(t, pretermLabel, unigramTagger);
            pw.println(t.toString());
        }
        System.out.println("Processed " + nTrees + " trees");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchXMLTreeReader(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 20 with TreeReader

use of edu.stanford.nlp.trees.TreeReader in project CoreNLP by stanfordnlp.

the class MultiWordPreprocessor method resolveDummyTags.

/**
 * Reads every tree from {@code treeFile}, passes it through
 * {@code traverseAndFix}, expands it with a {@code MultiWordTreeExpander},
 * optionally normalizes it, and writes the result, one tree per line, to a
 * sibling file named {@code treeFile + ".fixed"}. The number of processed
 * trees is printed to standard output.
 *
 * <p>I/O failures are printed as stack traces and not propagated.
 *
 * @param treeFile      UTF-8 tree file to read
 * @param unigramTagger counter forwarded unchanged to {@code traverseAndFix}
 * @param retainNER     flag forwarded unchanged to {@code traverseAndFix}
 * @param tn            normalizer passed to the expander; when non-null it is
 *                      also applied to the whole expanded tree
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
    TreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    TreeReaderFactory trf = new SpanishTreeReaderFactory();
    // Both resources are closed automatically (writer first, then reader), even on failure.
    // PrintWriter(File, charset) is used so characters are really encoded as UTF-8:
    // wrapping a UTF-8 PrintStream in a PrintWriter encodes with the platform
    // default charset, because the PrintStream only passes the bytes through.
    try (TreeReader tr = trf.newTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
         PrintWriter pw = new PrintWriter(new File(treeFile + ".fixed"), "UTF-8")) {
        int nTrees = 0;
        for (Tree t; (t = tr.readTree()) != null; nTrees++) {
            traverseAndFix(t, null, unigramTagger, retainNER);
            // Now "decompress" further the expanded trees formed by
            // multiword token splitting
            t = expander.expandPhrases(t, tn, tf);
            if (tn != null) {
                t = tn.normalizeWholeTree(t, tf);
            }
            pw.println(t.toString());
        }
        System.out.println("Processed " + nTrees + " trees");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : TreeReader(edu.stanford.nlp.trees.TreeReader) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) TreeFactory(edu.stanford.nlp.trees.TreeFactory) Tree(edu.stanford.nlp.trees.Tree) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory)

Aggregations

TreeReader (edu.stanford.nlp.trees.TreeReader)20 Tree (edu.stanford.nlp.trees.Tree)19 TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)17 IOException (java.io.IOException)7 CoreLabel (edu.stanford.nlp.ling.CoreLabel)6 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)6 FileInputStream (java.io.FileInputStream)6 InputStreamReader (java.io.InputStreamReader)6 BufferedReader (java.io.BufferedReader)5 FileNotFoundException (java.io.FileNotFoundException)5 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)3 PennTreeReader (edu.stanford.nlp.trees.PennTreeReader)3 TreebankLanguagePack (edu.stanford.nlp.trees.TreebankLanguagePack)3 SpanishTreeReaderFactory (edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory)3 Pattern (java.util.regex.Pattern)3 Label (edu.stanford.nlp.ling.Label)2 LabeledScoredTreeReaderFactory (edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory)2 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)2 ArabicTreeReaderFactory (edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory)2