Search in sources :

Example 6 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class TreebankStats method main.

/**
 * Command-line entry point. Reads the option flags and the positional arguments,
 * builds a {@code TreebankStats} for the requested language and corpus paths,
 * optionally loads a split, and runs the statistics.
 *
 * @param args command-line arguments; after option parsing, the positional part must
 *             split into exactly {@code MIN_ARGS} whitespace-separated tokens, the first
 *             of which names a {@code Language} constant and the rest corpus paths
 */
public static void main(String[] args) {
    if (args.length < MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }

    final Properties options = StringUtils.argsToProperties(args, optArgDefs());

    // Option flags.
    final String splitPrefix = options.getProperty("s", null);
    final boolean showWords = PropertiesUtils.getBool(options, "w", false);
    final boolean pathsAreFiles = PropertiesUtils.getBool(options, "f", false);
    final boolean showOov = PropertiesUtils.getBool(options, "o", false);

    // Positional arguments are stored under the empty key as one whitespace-joined string.
    final String[] positional = options.getProperty("", "").split("\\s+");
    if (positional.length != MIN_ARGS) {
        log.info(usage());
        System.exit(-1);
    }

    // First positional token selects the language; the remaining ones are corpus paths.
    final Language language = Language.valueOf(positional[0]);
    final List<String> corpusPaths = new ArrayList<>(positional.length - 1);
    int next = 1;
    while (next < positional.length) {
        corpusPaths.add(positional[next]);
        next++;
    }

    final TreebankStats stats = new TreebankStats(language, corpusPaths, language.params);
    if (splitPrefix != null && !stats.useSplit(splitPrefix)) {
        log.info("Could not load split!");
    }
    stats.run(pathsAreFiles, showWords, showOov);
}
Also used : Language(edu.stanford.nlp.international.Language) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties)

Example 7 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class RuleBranchingFactor method main.

/**
 * Command-line entry point. Loads a treebank from the path given as the positional
 * argument and prints branching statistics to standard output: number of trees,
 * number of multi-child rules (tokens and distinct types), mean and standard
 * deviation of their branching factor, rule entropy, and number of single-child rules.
 *
 * <p>Options read here: {@code l} (language, default English) and {@code e}
 * (input/output encoding, default UTF-8).
 *
 * @param args command-line arguments; at least {@code minArgs} are required
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage);
        System.exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
    String fileName = options.getProperty("");
    if (fileName == null || fileName.equals("")) {
        System.out.println(usage);
        System.exit(-1);
    }
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<>(20000);
    List<Integer> branchingFactors = new ArrayList<>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    // Despite the name, this counts every phrasal node with more than one child.
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    // Read the treebank
    for (Tree tree : tb) {
        // Skip an explicit ROOT wrapper so it is not counted as a rule.
        if (tree.value().equals("ROOT")) {
            tree = tree.firstChild();
        }
        ++nTrees;
        for (Tree subTree : tree) {
            if (!subTree.isPhrasal()) {
                continue;
            }
            int numChildren = subTree.numChildren();
            if (numChildren > 1) {
                ++nBinaryRules;
                branchingFactors.add(numChildren);
                binaryBranchingFactors += numChildren;
                binaryRuleTypes.incrementCount(treeToRuleString(subTree));
            } else {
                ++nUnaryRules;
            }
        }
    }
    // With no multi-child rules this is 0.0 / 0.0, i.e. NaN, and is printed as such.
    double mean = (double) binaryBranchingFactors / (double) nBinaryRules;
    System.out.printf("#trees:\t%d%n", nTrees);
    System.out.printf("#binary:\t%d%n", nBinaryRules);
    System.out.printf("#binary types:\t%d%n", binaryRuleTypes.keySet().size());
    System.out.printf("mean branching:\t%.4f%n", mean);
    System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
    System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(binaryRuleTypes));
    System.out.printf("#unaries:\t%d%n", nUnaryRules);
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 8 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class ManipulateTopBracket method main.

/**
 * Command-line entry point. Reads every tree from the given treebank files and prints
 * it with the top bracket either removed or added, depending on the {@code b} option.
 *
 * <p>Options read here: {@code l} (language, default English), {@code e}
 * (input/output encoding, default UTF-8) and {@code b} (remove the top bracket
 * instead of adding it). The positional arguments are the treebank paths.
 *
 * @param args command-line arguments; at least {@code minArgs} are required
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    // The encoding has its own option key. Reading it from "l" would reuse the
    // language name (e.g. "Arabic") as the character set whenever -l is given.
    String encoding = options.getProperty("e", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    // "".split("\\s+") yields a one-element array holding "", so emptiness has to be
    // checked on the string itself, not on the length of the split result.
    String fileList = options.getProperty("", "").trim();
    if (fileList.isEmpty()) {
        log.info(usage());
        System.exit(-1);
    }
    for (String filename : fileList.split("\\s+")) {
        tb.loadPath(filename);
    }
    PrintWriter pwo = tlpp.pw();
    String startSymbol = tlpp.treebankLanguagePack().startSymbol();
    TreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    for (Tree t : tb) {
        if (removeBracket) {
            // Strip the top bracket only when the tree actually has one.
            if (t.value().equals(startSymbol)) {
                t = t.firstChild();
            }
        } else if (!t.value().equals(startSymbol)) {
            // Add a bracket if it isn't already there
            t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
        }
        pwo.println(t.toString());
        nTrees++;
    }
    pwo.close();
    System.err.printf("Processed %d trees.%n", nTrees);
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Language(edu.stanford.nlp.international.Language) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) TreeFactory(edu.stanford.nlp.trees.TreeFactory) Tree(edu.stanford.nlp.trees.Tree) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) PrintWriter(java.io.PrintWriter)

Example 9 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class PunctFrequencyDist method main.

/**
 * Command-line entry point. Counts the word forms that carry a given tag in a
 * treebank and prints them, most frequent first, as {@code word<TAB>count} lines.
 *
 * <p>Options: {@code -l <language>} and {@code -e <encoding>} (default UTF-8).
 * Positional arguments are consumed in pairs: a tag followed by a treebank path.
 * The tag of the last pair is the one that is counted. The encoding is applied when
 * the first pair is seen, so {@code -e} must precede it to take effect.
 *
 * @param args command-line arguments; at least {@code minArgs} are required
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    String puncTag = null;
    for (int i = 0; i < args.length; i++) {
        // Both an option and a tag must be followed by one more argument
        // (the option value, or the treebank path).
        if (i + 1 >= args.length) {
            System.out.println(usage.toString());
            System.exit(-1);
            return;
        }
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            puncTag = args[i++];
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    // Only options were supplied: there is no treebank to read and no tag to count.
    if (tb == null) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }
    Counter<String> puncTypes = new ClassicCounter<>();
    for (Tree t : tb) {
        List<CoreLabel> yield = t.taggedLabeledYield();
        for (CoreLabel word : yield) {
            // puncTag is non-null here; comparing from its side tolerates a missing tag.
            if (puncTag.equals(word.tag())) {
                puncTypes.incrementCount(word.word());
            }
        }
    }
    List<String> biggestKeys = new ArrayList<>(puncTypes.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));
    PrintWriter pw = tlpp.pw();
    for (String wordType : biggestKeys) {
        pw.printf("%s\t%d%n", wordType, (int) puncTypes.getCount(wordType));
    }
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 10 with TreebankLangParserParams

use of edu.stanford.nlp.parser.lexparser.TreebankLangParserParams in project CoreNLP by stanfordnlp.

the class UNKPrinter method main.

/**
 * Command-line entry point. Reads a treebank and, for the trees after the first half,
 * counts the unknown-word signature of every word form seen for the first time.
 * The signatures are printed, most frequent first, as {@code signature<TAB>count} lines.
 *
 * <p>Options: {@code -l <language>} and {@code -e <encoding>} (default UTF-8).
 * All other arguments are treebank paths. The encoding is applied when the first
 * path is seen, so {@code -e} must precede it to take effect.
 *
 * @param args command-line arguments; at least {@code minArgs} are required
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            // Every supported option takes a value; reject one that is missing.
            if (i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
                return;
            }
            switch(args[i]) {
                case "-l":
                    lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    // Only options were supplied: there is no treebank to read.
    if (tb == null) {
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }
    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    // Language-specific unknown-word settings; other languages keep the defaults.
    if (lang == Language.French) {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    } else if (lang == Language.Arabic) {
        lexOptions.smartMutation = false;
        lexOptions.useUnknownWordSignatures = 9;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);
    // The first half of the treebank only builds the vocabulary.
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<>();
    Counter<String> unkCounter = new ClassicCounter<>();
    int treeId = 0;
    for (Tree t : tb) {
        List<Label> yield = t.yield();
        int posId = 0;
        for (Label word : yield) {
            vocab.incrementCount(word.value());
            // A count below 2 right after incrementing means this is the first occurrence.
            if (treeId > computeAfter && vocab.getCount(word.value()) < 2.0) {
                unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(word.value(), posId));
            }
            // Advance for every token so posId is the token's index in this yield
            // (presumably the sentence position getSignature expects; verify against
            // UnknownWordModel). It used to advance only for counted tokens.
            posId++;
        }
        treeId++;
    }
    List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
    for (String wordType : biggestKeys) {
        pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
    }
    pw.close();
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Lexicon(edu.stanford.nlp.parser.lexparser.Lexicon) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) HashIndex(edu.stanford.nlp.util.HashIndex) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Aggregations

TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) 17, Tree (edu.stanford.nlp.trees.Tree) 15, Language (edu.stanford.nlp.international.Language) 14, PrintWriter (java.io.PrintWriter) 13, Label (edu.stanford.nlp.ling.Label) 8, EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) 8, DiskTreebank (edu.stanford.nlp.trees.DiskTreebank) 7, Treebank (edu.stanford.nlp.trees.Treebank) 7, ArrayList (java.util.ArrayList) 7, ClassicCounter (edu.stanford.nlp.stats.ClassicCounter) 6, TreeTransformer (edu.stanford.nlp.trees.TreeTransformer) 6, Properties (java.util.Properties) 6, CoreLabel (edu.stanford.nlp.ling.CoreLabel) 5, Map (java.util.Map) 3, ArabicMorphoFeatureSpecification (edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification) 1, FrenchMorphoFeatureSpecification (edu.stanford.nlp.international.french.FrenchMorphoFeatureSpecification) 1, MorphoFeatureSpecification (edu.stanford.nlp.international.morph.MorphoFeatureSpecification) 1, RuntimeIOException (edu.stanford.nlp.io.RuntimeIOException) 1, Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon) 1, Options (edu.stanford.nlp.parser.lexparser.Options) 1