Search in sources :

Example 16 with TreeReaderFactory

use of edu.stanford.nlp.trees.TreeReaderFactory in project CoreNLP by stanfordnlp.

the class AddMorphoAnnotations method main.

/**
 * Reads one bracketed tree per line from standard input, rewrites every leaf
 * as {@code value + MORPHO_MARK + lemma + LEMMA_MARK + morphTag}, and prints
 * the rewritten tree to standard output. The number of processed trees is
 * reported on standard error.
 *
 * <p>Options read from the parsed command line: {@code e} selects the
 * encoding used to decode standard input (default {@code UTF-8}); {@code g}
 * is passed to the first {@code YieldIterator} as its "morph tree file" flag
 * (default {@code false}).
 *
 * @param args command-line arguments; after option parsing exactly two
 *     positional values must remain: the morphological analysis file followed
 *     by the lemma file. Otherwise the usage message is logged and the JVM
 *     exits with status -1.
 * @throws RuntimeException if either file runs out of entries before standard
 *     input does
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        log.info(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argSpec());
    String encoding = options.getProperty("e", "UTF-8");
    boolean isMorphTreeFile = PropertiesUtils.getBool(options, "g", false);
    // Default to "" so that a command line without positional arguments falls
    // through to the usage message below instead of failing with a
    // NullPointerException on split(): "".split("\\s+") has length 1.
    String[] parsedArgs = options.getProperty("", "").split("\\s+");
    if (parsedArgs.length != 2) {
        log.info(usage());
        System.exit(-1);
    }
    YieldIterator morphIter = new YieldIterator(parsedArgs[0], isMorphTreeFile);
    YieldIterator lemmaIter = new YieldIterator(parsedArgs[1], false);
    final Pattern pParenStripper = Pattern.compile("[\\(\\)]");
    try {
        BufferedReader brIn = new BufferedReader(new InputStreamReader(System.in, encoding));
        TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);
        int nTrees = 0;
        for (String line; (line = brIn.readLine()) != null; ++nTrees) {
            Tree tree = trf.newTreeReader(new StringReader(line)).readTree();
            List<Tree> leaves = tree.getLeaves();
            if (!morphIter.hasNext()) {
                throw new RuntimeException("Mismatch between number of morpho analyses and number of input lines.");
            }
            List<String> morphTags = morphIter.next();
            if (!lemmaIter.hasNext()) {
                throw new RuntimeException("Mismatch between number of lemmas and number of input lines.");
            }
            List<String> lemmas = lemmaIter.next();
            // Sanity checks (only active when the JVM runs with -ea)
            assert morphTags.size() == lemmas.size();
            assert lemmas.size() == leaves.size();
            for (int i = 0; i < leaves.size(); ++i) {
                // Strip parentheses from the tag; replaceAll leaves a tag
                // without parentheses unchanged, so no prior find() is needed.
                String morphTag = pParenStripper.matcher(morphTags.get(i)).replaceAll("");
                Tree leaf = leaves.get(i);
                String newLeaf = String.format("%s%s%s%s%s", leaf.value(), MorphoFeatureSpecification.MORPHO_MARK, lemmas.get(i), MorphoFeatureSpecification.LEMMA_MARK, morphTag);
                leaf.setValue(newLeaf);
            }
            System.out.println(tree.toString());
        }
        // Sanity checks: both side files should be exhausted together with stdin
        assert !morphIter.hasNext();
        assert !lemmaIter.hasNext();
        System.err.printf("Processed %d trees%n", nTrees);
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and FileNotFoundException,
        // which are IOException subclasses and were handled identically.
        e.printStackTrace();
    }
}
Also used : Pattern(java.util.regex.Pattern) Properties(java.util.Properties) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) ArabicTreeReaderFactory(edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory)

Example 17 with TreeReaderFactory

use of edu.stanford.nlp.trees.TreeReaderFactory in project CoreNLP by stanfordnlp.

the class FTBCorrector method main.

/**
 * Reads trees from the UTF-8 file named by the single argument, drops the
 * trees matched by either "bad tree" Tregex pattern, runs every remaining
 * tree through an {@code FTBCorrector} and prints the result to standard
 * output. The number of trees actually written is reported on standard error.
 *
 * @param args exactly one element: the path of the tree file. Otherwise the
 *     usage message is logged and the JVM exits with status -1.
 */
public static void main(String[] args) {
    if (args.length != 1) {
        log.info("Usage: java " + FTBCorrector.class.getName() + " filename\n");
        System.exit(-1);
    }
    TreeTransformer tt = new FTBCorrector();
    File f = new File(args[0]);
    // try-with-resources releases the file handle even when reading or
    // transforming a tree throws part-way through the file.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"))) {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        // Counts only the trees that are printed; discarded trees are not
        // included in the "Wrote" summary.
        int nWritten = 0;
        for (Tree t; (t = tr.readTree()) != null; ) {
            if (pBadTree.matcher(t).find() || pBadTree2.matcher(t).find()) {
                log.info("Discarding tree: " + t.toString());
            } else {
                Tree fixedT = tt.transformTree(t);
                System.out.println(fixedT.toString());
                nWritten++;
            }
        }
        tr.close();
        System.err.printf("Wrote %d trees%n", nWritten);
    } catch (IOException | TregexParseException e) {
        // IOException also covers UnsupportedEncodingException and
        // FileNotFoundException, which were handled identically.
        e.printStackTrace();
    }
}
Also used : TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Example 18 with TreeReaderFactory

use of edu.stanford.nlp.trees.TreeReaderFactory in project CoreNLP by stanfordnlp.

the class MWEPreprocessor method resolveDummyTags.

/**
 * Reads every tree from {@code treeFile} (decoded as UTF-8), passes it to
 * {@code traverseAndFix} together with the two counters, and writes the
 * resulting tree, one per line, to a sibling file named
 * {@code treeFile + ".fixed"} (encoded as UTF-8). The number of processed
 * trees is printed to standard output. I/O failures are reported with a
 * stack trace and not rethrown.
 *
 * @param treeFile input tree file; also determines the output file name
 * @param pretermLabel counter forwarded unchanged to {@code traverseAndFix}
 * @param unigramTagger counter forwarded unchanged to {@code traverseAndFix}
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
    // try-with-resources closes the input file, and flushes and closes the
    // output file, even when an exception interrupts the loop.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReaderFactory trf = new FrenchTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        int nTrees = 0;
        try (PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
            for (Tree t; (t = tr.readTree()) != null; nTrees++) {
                traverseAndFix(t, pretermLabel, unigramTagger);
                pw.println(t.toString());
            }
        }
        tr.close();
        System.out.println("Processed " + nTrees + " trees");
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and FileNotFoundException,
        // which are IOException subclasses and were handled identically.
        e.printStackTrace();
    }
}
Also used : TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchXMLTreeReader(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 19 with TreeReaderFactory

use of edu.stanford.nlp.trees.TreeReaderFactory in project CoreNLP by stanfordnlp.

the class MultiWordPreprocessor method resolveDummyTags.

/**
 * Reads every tree from {@code treeFile} (decoded as UTF-8), fixes it with
 * {@code traverseAndFix}, expands it with a {@code MultiWordTreeExpander},
 * optionally normalizes it, and writes the result, one tree per line, to a
 * sibling file named {@code treeFile + ".fixed"} (encoded as UTF-8). The
 * number of processed trees is printed to standard output. I/O failures are
 * reported with a stack trace and not rethrown.
 *
 * @param treeFile input tree file; also determines the output file name
 * @param unigramTagger counter forwarded to {@code traverseAndFix}
 * @param retainNER flag forwarded to {@code traverseAndFix}
 * @param tn normalizer passed to the expander; when non-null it is also
 *     applied to the whole expanded tree
 */
private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> unigramTagger, boolean retainNER, TreeNormalizer tn) {
    TreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    // try-with-resources closes the input file, and flushes and closes the
    // output file, even when an exception interrupts the loop.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReaderFactory trf = new SpanishTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);
        int nTrees = 0;
        try (PrintWriter pw = new PrintWriter(new PrintStream(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"))) {
            for (Tree t; (t = tr.readTree()) != null; nTrees++) {
                traverseAndFix(t, null, unigramTagger, retainNER);
                // Now "decompress" further the expanded trees formed by
                // multiword token splitting
                t = expander.expandPhrases(t, tn, tf);
                if (tn != null) {
                    t = tn.normalizeWholeTree(t, tf);
                }
                pw.println(t.toString());
            }
        }
        tr.close();
        System.out.println("Processed " + nTrees + " trees");
    } catch (IOException e) {
        // Also covers UnsupportedEncodingException and FileNotFoundException,
        // which are IOException subclasses and were handled identically.
        e.printStackTrace();
    }
}
Also used : TreeReader(edu.stanford.nlp.trees.TreeReader) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) TreeFactory(edu.stanford.nlp.trees.TreeFactory) Tree(edu.stanford.nlp.trees.Tree) SpanishTreeReaderFactory(edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory)

Example 20 with TreeReaderFactory

use of edu.stanford.nlp.trees.TreeReaderFactory in project CoreNLP by stanfordnlp.

the class TaggedFileRecord method createRecord.

/**
 * Builds a {@code TaggedFileRecord} from a comma-separated description.
 *
 * <p>The last comma-separated piece is the file; every preceding piece must
 * have the form {@code key=value} (split on the first {@code =}). Keys are
 * matched case-insensitively against the class's option-name constants.
 * A description without any comma is treated as a plain file name in
 * {@code Format.TEXT} with the encoding and tag separator taken from
 * {@code config}.
 *
 * @param config properties supplying the default encoding and tag separator
 * @param description the description string to parse
 * @return the record described by {@code description}
 * @throws IllegalArgumentException if the description contains no file name
 *     (for example it consists only of commas), if an option piece has no
 *     {@code =}, or if an option key is unknown
 */
public static TaggedFileRecord createRecord(Properties config, String description) {
    String[] pieces = description.split(",");
    // split() drops trailing empty strings, so a description made only of
    // commas yields an empty array; reject it explicitly rather than failing
    // below with a NegativeArraySizeException.
    if (pieces.length == 0) {
        throw new IllegalArgumentException("TaggedFileRecord description " + description + " does not name a file");
    }
    if (pieces.length == 1) {
        return new TaggedFileRecord(description, Format.TEXT, getEncoding(config), getTagSeparator(config), null, null, null, null, null, null, null);
    }
    // Everything before the last piece is an option; the last piece is the file.
    String[] args = new String[pieces.length - 1];
    System.arraycopy(pieces, 0, args, 0, pieces.length - 1);
    String file = pieces[pieces.length - 1];
    Format format = Format.TEXT;
    String encoding = getEncoding(config);
    String tagSeparator = getTagSeparator(config);
    TreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    TreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    Predicate<Tree> treeFilter = null;
    Integer wordColumn = null, tagColumn = null;
    for (String arg : args) {
        // Limit of 2 keeps any further '=' characters inside the value.
        String[] argPieces = arg.split("=", 2);
        if (argPieces.length != 2) {
            throw new IllegalArgumentException("TaggedFileRecord argument " + arg + " has an unexpected number of =s");
        }
        String key = argPieces[0];
        String value = argPieces[1];
        if (key.equalsIgnoreCase(FORMAT)) {
            format = Format.valueOf(value);
        } else if (key.equalsIgnoreCase(ENCODING)) {
            encoding = value;
        } else if (key.equalsIgnoreCase(TAG_SEPARATOR)) {
            tagSeparator = value;
        } else if (key.equalsIgnoreCase(TREE_TRANSFORMER)) {
            treeTransformer = ReflectionLoading.loadByReflection(value);
        } else if (key.equalsIgnoreCase(TREE_NORMALIZER)) {
            treeNormalizer = ReflectionLoading.loadByReflection(value);
        } else if (key.equalsIgnoreCase(TREE_READER)) {
            trf = ReflectionLoading.loadByReflection(value);
        } else if (key.equalsIgnoreCase(TREE_RANGE)) {
            // Commas already separate the pieces of the description, so the
            // range is written with ':' and converted back to ',' here.
            String range = value.replace(':', ',');
            treeRange = new NumberRangesFileFilter(range, true);
        } else if (key.equalsIgnoreCase(TREE_FILTER)) {
            treeFilter = ReflectionLoading.loadByReflection(value);
        } else if (key.equalsIgnoreCase(WORD_COLUMN)) {
            wordColumn = Integer.valueOf(value);
        } else if (key.equalsIgnoreCase(TAG_COLUMN)) {
            tagColumn = Integer.valueOf(value);
        } else {
            throw new IllegalArgumentException("TaggedFileRecord argument " + key + " is unknown");
        }
    }
    return new TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn);
}
Also used : TreeNormalizer(edu.stanford.nlp.trees.TreeNormalizer) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Tree(edu.stanford.nlp.trees.Tree) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer)

Aggregations

TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)23 Tree (edu.stanford.nlp.trees.Tree)20 TreeReader (edu.stanford.nlp.trees.TreeReader)17 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)7 FileInputStream (java.io.FileInputStream)7 InputStreamReader (java.io.InputStreamReader)7 BufferedReader (java.io.BufferedReader)6 IOException (java.io.IOException)6 FileNotFoundException (java.io.FileNotFoundException)5 UnsupportedEncodingException (java.io.UnsupportedEncodingException)5 CoreLabel (edu.stanford.nlp.ling.CoreLabel)4 ArabicTreeReaderFactory (edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory)4 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)3 LabeledScoredTreeReaderFactory (edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory)3 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)3 TreebankLanguagePack (edu.stanford.nlp.trees.TreebankLanguagePack)3 FrenchXMLTreeReader (edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader)3 SpanishTreeReaderFactory (edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory)3 Label (edu.stanford.nlp.ling.Label)2 PennTreeReader (edu.stanford.nlp.trees.PennTreeReader)2