Search in sources:

Example 1 with FrenchTreeReaderFactory

use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.

In class Preferences, method lookupTreeReaderFactory:

/**
 * Resolves a {@code TreeReaderFactory} from a name.
 *
 * <p>The name is first compared, ignoring case, against a fixed list of known
 * factory names. If none matches, the name is treated as a class name, loaded
 * with {@code Class.forName}, and instantiated through its no-argument
 * constructor. If that reflective lookup fails for any reason (class not found,
 * not a {@code TreeReaderFactory}, no usable constructor, constructor throws),
 * a {@code PennTreeReaderFactory} is returned instead.
 *
 * @param trfName the factory name or class name; must not be {@code null}
 * @return the matching factory, or a {@code PennTreeReaderFactory} as a fallback
 */
static TreeReaderFactory lookupTreeReaderFactory(String trfName) {
    if (trfName.equalsIgnoreCase("ArabicTreeReaderFactory")) {
        return new ArabicTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("ArabicTreeReaderFactory.ArabicRawTreeReaderFactory")) {
        return new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("CTBTreeReaderFactory")) {
        return new CTBTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("NoEmptiesCTBTreeReaderFactory")) {
        return new NoEmptiesCTBTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("Basic categories only (LabeledScoredTreeReaderFactory)")) {
        return new LabeledScoredTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("FrenchTreeReaderFactory")) {
        //PTB format
        return new FrenchTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("PennTreeReaderFactory")) {
        return new PennTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("StringLabeledScoredTreeReaderFactory")) {
        return new StringLabeledScoredTreeReaderFactory();
    }
    if (trfName.equalsIgnoreCase("TregexTreeReaderFactory")) {
        return new TregexPattern.TRegexTreeReaderFactory();
    }

    // Not a known short name: try to load it as a class.
    try {
        // asSubclass rejects non-factory classes before anything is constructed;
        // getDeclaredConstructor().newInstance() replaces the deprecated
        // Class.newInstance(), wrapping constructor failures in a
        // ReflectiveOperationException instead of leaking them undeclared.
        return Class.forName(trfName)
                .asSubclass(TreeReaderFactory.class)
                .getDeclaredConstructor()
                .newInstance();
    } catch (ReflectiveOperationException | RuntimeException e) {
        // Deliberate best-effort: any failure falls back to the Penn factory.
        return new PennTreeReaderFactory();
    }
}
Also used : ArabicTreeReaderFactory(edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory) StringLabeledScoredTreeReaderFactory(edu.stanford.nlp.trees.StringLabeledScoredTreeReaderFactory) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) NoEmptiesCTBTreeReaderFactory(edu.stanford.nlp.trees.international.pennchinese.NoEmptiesCTBTreeReaderFactory) CTBTreeReaderFactory(edu.stanford.nlp.trees.international.pennchinese.CTBTreeReaderFactory) NoEmptiesCTBTreeReaderFactory(edu.stanford.nlp.trees.international.pennchinese.NoEmptiesCTBTreeReaderFactory) PennTreeReaderFactory(edu.stanford.nlp.trees.PennTreeReaderFactory) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) CTBTreeReaderFactory(edu.stanford.nlp.trees.international.pennchinese.CTBTreeReaderFactory) StringLabeledScoredTreeReaderFactory(edu.stanford.nlp.trees.StringLabeledScoredTreeReaderFactory) NoEmptiesCTBTreeReaderFactory(edu.stanford.nlp.trees.international.pennchinese.NoEmptiesCTBTreeReaderFactory) PennTreeReaderFactory(edu.stanford.nlp.trees.PennTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) LabeledScoredTreeReaderFactory(edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory) ArabicTreeReaderFactory(edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory) StringLabeledScoredTreeReaderFactory(edu.stanford.nlp.trees.StringLabeledScoredTreeReaderFactory) LabeledScoredTreeReaderFactory(edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory)

Example 2 with FrenchTreeReaderFactory

use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.

In class MWEPreprocessor, method main:

/**
 * Reads every tree from a UTF-8 tree file with a {@code FrenchTreeReaderFactory},
 * accumulates MWE statistics via {@code countMWEStatistics}, and writes four
 * counters to CSV files through {@code printCounter}. When
 * {@code RESOLVE_DUMMY_TAGS} is set, {@code resolveDummyTags} is run on the same
 * file afterwards. Summary counts are printed to standard output.
 *
 * <p>I/O failures are reported with a stack trace and otherwise ignored.
 *
 * @param args exactly one element: the path of the tree file; otherwise a usage
 *             message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEPreprocessor.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<>();
    TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<>();
    try {
        // try-with-resources guarantees the file handle is released even when
        // reading or counting throws; the scope ends before the later steps so
        // the file is closed by the time treeFile is used again.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
            TreeReaderFactory trf = new FrenchTreeReaderFactory();
            TreeReader tr = trf.newTreeReader(br);
            for (Tree t; (t = tr.readTree()) != null; ) {
                countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            }
            //Closes the underlying reader
            tr.close();
        }
        System.out.println("Generating {MWE Type -> Terminal}");
        printCounter(labelTerm, "label_term.csv");
        System.out.println("Generating {Terminal -> MWE Type}");
        printCounter(termLabel, "term_label.csv");
        System.out.println("Generating {MWE Type -> POS sequence}");
        printCounter(labelPreterm, "label_pos.csv");
        System.out.println("Generating {POS sequence -> MWE Type}");
        printCounter(pretermLabel, "pos_label.csv");
        if (RESOLVE_DUMMY_TAGS) {
            System.out.println("Resolving DUMMY tags");
            resolveDummyTags(treeFile, pretermLabel, unigramTagger);
        }
        System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
        System.out.println("#Missing POS: " + nMissingPOS);
        System.out.println("#Missing Phrasal: " + nMissingPhrasal);
        System.out.println("Done!");
    } catch (IOException e) {
        // Covers UnsupportedEncodingException and FileNotFoundException as well,
        // both of which are IOException subclasses handled identically.
        e.printStackTrace();
    }
}
Also used : TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchXMLTreeReader(edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 3 with FrenchTreeReaderFactory

use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.

In class MWEFrequencyDist, method main:

/**
 * Reads every tree from a UTF-8 tree file with a {@code FrenchTreeReaderFactory},
 * finds all nodes whose label matches the Tregex pattern {@code /^MW/}, and
 * prints a tab-separated frequency table to standard output.
 *
 * <p>For each matched label the table shows the total count, the number of
 * yields seen exactly once, and two percentages; a TOTAL row, a token count,
 * and the number of distinct POS sequences follow.
 *
 * <p>I/O and pattern-compilation failures are reported with a stack trace and
 * otherwise ignored.
 *
 * @param args exactly one element: the path of the tree file; otherwise a usage
 *             message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s file%n", MWEFrequencyDist.class.getName());
        System.exit(-1);
    }
    final File treeFile = new File(args[0]);
    TwoDimensionalCounter<String, String> mweLabelToString = new TwoDimensionalCounter<>();
    Set<String> uniquePOSSequences = Generics.newHashSet();
    try {
        // try-with-resources releases the file handle even if reading or
        // matching throws part-way through the file.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
            TreeReaderFactory trf = new FrenchTreeReaderFactory();
            TreeReader tr = trf.newTreeReader(br);
            final TregexPattern pMWE = TregexPattern.compile("/^MW/");
            for (Tree t; (t = tr.readTree()) != null; ) {
                //Count MWE statistics
                TregexMatcher m = pMWE.matcher(t);
                while (m.findNextMatchingNode()) {
                    Tree match = m.getMatch();
                    String label = match.value();
                    List<CoreLabel> yield = match.taggedLabeledYield();
                    StringBuilder termYield = new StringBuilder();
                    StringBuilder posYield = new StringBuilder();
                    for (CoreLabel cl : yield) {
                        termYield.append(cl.word()).append(" ");
                        posYield.append(cl.tag()).append(" ");
                    }
                    mweLabelToString.incrementCount(label, termYield.toString().trim());
                    uniquePOSSequences.add(posYield.toString().trim());
                }
            }
            //Closes the underlying reader
            tr.close();
        }
        System.out.printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.totalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        for (String mweLabel : mweLabelToString.firstKeySet()) {
            int nSingletons = 0;
            double totalCount = mweLabelToString.totalCount(mweLabel);
            Counter<String> mc = mweLabelToString.getCounter(mweLabel);
            for (String term : mc.keySet()) {
                if (mc.getCount(term) == 1.0) {
                    nSingletons++;
                }
                // Tokens = whitespace-separated words in the yield times its count.
                nTokens += term.split("\\s+").length * (int) mc.getCount(term);
            }
            nAllSingletons += nSingletons;
            System.out.printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int) totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.out.printf("TOTAL:\t%d\t%d\t%.2f%n", (int) nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.out.println("#tokens = " + nTokens);
        System.out.println("#unique MWE POS sequences = " + uniquePOSSequences.size());
    } catch (TregexParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        // Covers UnsupportedEncodingException and FileNotFoundException as well,
        // both of which are IOException subclasses handled identically.
        e.printStackTrace();
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) Tree(edu.stanford.nlp.trees.Tree) TregexParseException(edu.stanford.nlp.trees.tregex.TregexParseException) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) InputStreamReader(java.io.InputStreamReader) TwoDimensionalCounter(edu.stanford.nlp.stats.TwoDimensionalCounter) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) BufferedReader(java.io.BufferedReader) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) File(java.io.File)

Example 4 with FrenchTreeReaderFactory

use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.

In class MungeTreesWithMorfetteAnalyses, method main:

/**
 * Walks a UTF-8 tree file and a Morfette analysis file in parallel, rewrites
 * each leaf of each tree to combine the original token, a lemma, and the
 * analysis tag, and prints every rewritten tree to standard output.
 *
 * <p>The new leaf value is the token value, {@code MORPHO_MARK}, the lemma from
 * {@code getLemma}, {@code LEMMA_MARK}, and the analysis tag, concatenated in
 * that order. If one input runs out before the other, a warning is logged.
 *
 * <p>I/O failures are reported with a stack trace and otherwise ignored.
 *
 * @param args exactly two elements: the tree file path and the Morfette file
 *             path; otherwise a usage message is printed and the JVM exits
 *             with status -1
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.printf("Usage: java %s tree_file morfette_tnt_file%n", MungeTreesWithMorfetteAnalyses.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    String morfetteFile = args[1];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources releases the tree file even if processing throws.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReader tr = trf.newTreeReader(br);
        Iterator<List<CoreLabel>> morfetteItr = new MorfetteFileIterator(morfetteFile);
        // Set when a tree was read but no analysis remains for it. Checking this
        // explicitly (instead of reading one more tree after the loop) catches
        // the case where the tree file has exactly one extra tree.
        boolean treeWithoutAnalysis = false;
        for (Tree tree; (tree = tr.readTree()) != null; ) {
            if (!morfetteItr.hasNext()) {
                treeWithoutAnalysis = true;
                break;
            }
            List<CoreLabel> analysis = morfetteItr.next();
            List<Label> yield = tree.yield();
            assert analysis.size() == yield.size();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel tokenAnalysis = analysis.get(i);
                Label token = yield.get(i);
                String lemma = getLemma(token.value(), tokenAnalysis.lemma());
                String newLeaf = String.format("%s%s%s%s%s", token.value(), MorphoFeatureSpecification.MORPHO_MARK, lemma, MorphoFeatureSpecification.LEMMA_MARK, tokenAnalysis.tag());
                ((CoreLabel) token).setValue(newLeaf);
            }
            System.out.println(tree.toString());
        }
        if (treeWithoutAnalysis || morfetteItr.hasNext()) {
            log.info("WARNING: Uneven input files!");
        }
        tr.close();
    } catch (IOException e) {
        // Covers UnsupportedEncodingException and FileNotFoundException as well,
        // both of which are IOException subclasses handled identically.
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) ArrayList(java.util.ArrayList) List(java.util.List) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Example 5 with FrenchTreeReaderFactory

use of edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory in project CoreNLP by stanfordnlp.

In class TreeToMorfette, method main:

/**
 * Reads every tree from a UTF-8 tree file with a {@code FrenchTreeReaderFactory}
 * and prints one line per leaf to standard output in the form
 * {@code word lemma morph}, with an empty line after each tree.
 *
 * <p>The lemma and morph come from splitting the leaf's original text with
 * {@code MorphoFeatureSpecification.splitMorphString}. When the morph is
 * {@code null}, empty, or {@code "XXX"}, the value of the preterminal at the
 * same position is used instead.
 *
 * <p>I/O failures are reported with a stack trace and otherwise ignored.
 *
 * @param args exactly one element: the path of the tree file; otherwise a usage
 *             message is printed and the JVM exits with status -1
 */
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.printf("Usage: java %s tree_file%n", TreeToMorfette.class.getName());
        System.exit(-1);
    }
    String treeFile = args[0];
    TreeReaderFactory trf = new FrenchTreeReaderFactory();
    // try-with-resources releases the tree file even if processing throws.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))) {
        TreeReader tr = trf.newTreeReader(br);
        for (Tree tree1; (tree1 = tr.readTree()) != null; ) {
            List<Label> pretermYield = tree1.preTerminalYield();
            List<Label> yield = tree1.yield();
            int yieldLen = yield.size();
            for (int i = 0; i < yieldLen; ++i) {
                CoreLabel rawToken = (CoreLabel) yield.get(i);
                String word = rawToken.value();
                String morphStr = rawToken.originalText();
                Pair<String, String> lemmaMorph = MorphoFeatureSpecification.splitMorphString(word, morphStr);
                String lemma = lemmaMorph.first();
                String morph = lemmaMorph.second();
                // Fall back to the preterminal value when no usable morph is present.
                if (morph == null || morph.equals("") || morph.equals("XXX")) {
                    morph = ((CoreLabel) pretermYield.get(i)).value();
                }
                System.out.printf("%s %s %s%n", word, lemma, morph);
            }
            System.out.println();
        }
        tr.close();
    } catch (IOException e) {
        // Covers UnsupportedEncodingException and FileNotFoundException as well,
        // both of which are IOException subclasses handled identically.
        e.printStackTrace();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Label(edu.stanford.nlp.ling.Label) FileNotFoundException(java.io.FileNotFoundException) TreeReader(edu.stanford.nlp.trees.TreeReader) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CoreLabel(edu.stanford.nlp.ling.CoreLabel) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) BufferedReader(java.io.BufferedReader) Tree(edu.stanford.nlp.trees.Tree) FrenchTreeReaderFactory(edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory) TreeReaderFactory(edu.stanford.nlp.trees.TreeReaderFactory)

Aggregations

TreeReaderFactory (edu.stanford.nlp.trees.TreeReaderFactory)7 FrenchTreeReaderFactory (edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory)7 Tree (edu.stanford.nlp.trees.Tree)6 TreeReader (edu.stanford.nlp.trees.TreeReader)6 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 BufferedReader (java.io.BufferedReader)3 FileInputStream (java.io.FileInputStream)3 FileNotFoundException (java.io.FileNotFoundException)3 IOException (java.io.IOException)3 InputStreamReader (java.io.InputStreamReader)3 UnsupportedEncodingException (java.io.UnsupportedEncodingException)3 Label (edu.stanford.nlp.ling.Label)2 TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter)2 FrenchXMLTreeReader (edu.stanford.nlp.trees.international.french.FrenchXMLTreeReader)2 TregexMatcher (edu.stanford.nlp.trees.tregex.TregexMatcher)2 TregexParseException (edu.stanford.nlp.trees.tregex.TregexParseException)2 TregexPattern (edu.stanford.nlp.trees.tregex.TregexPattern)2 LabeledScoredTreeReaderFactory (edu.stanford.nlp.trees.LabeledScoredTreeReaderFactory)1 PennTreeReaderFactory (edu.stanford.nlp.trees.PennTreeReaderFactory)1 StringLabeledScoredTreeReaderFactory (edu.stanford.nlp.trees.StringLabeledScoredTreeReaderFactory)1