
Example 1 with DiskTreebank

Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

From the class NoPunctuationHeadFinder, method main:

public static void main(String[] args) {
    // simple testing code
    Treebank treebank = new DiskTreebank();
    CategoryWordTag.suppressTerminalDetails = true;
    treebank.loadPath(args[0]);
    final HeadFinder chf = new NoPunctuationHeadFinder();
    treebank.apply(pt -> {
        pt.percolateHeads(chf);
        pt.pennPrint();
        System.out.println();
    });
}
Also used: ModCollinsHeadFinder(edu.stanford.nlp.trees.ModCollinsHeadFinder) HeadFinder(edu.stanford.nlp.trees.HeadFinder) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Treebank(edu.stanford.nlp.trees.Treebank)
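
For reference, a minimal, self-contained sketch of the same pattern: load a treebank from disk and percolate heads through every tree. The class name HeadPercolationSketch and the treebank path are placeholders, and ModCollinsHeadFinder stands in for NoPunctuationHeadFinder only because it ships with CoreNLP; any HeadFinder works here.

import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.ModCollinsHeadFinder;
import edu.stanford.nlp.trees.Treebank;

public class HeadPercolationSketch {
    public static void main(String[] args) {
        Treebank treebank = new DiskTreebank();
        // placeholder path; point this at a directory of Penn Treebank-style files
        treebank.loadPath("/path/to/treebank");
        final HeadFinder hf = new ModCollinsHeadFinder();
        treebank.apply(tree -> {
            // annotate each node with its head and print the tree in Penn format
            tree.percolateHeads(hf);
            tree.pennPrint();
        });
    }
}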

Example 2 with DiskTreebank

Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

From the class TreebankStats, method run:

public void run(boolean pathsAreFiles, boolean displayWords, boolean displayOOV) {
    if (useSplit) {
        // Gather statistics per predefined split, then report the aggregate followed by each split
        List<ObservedCorpusStats> allSplitStats = new ArrayList<>();
        makeVocab = true;
        for (Map.Entry<Split, Set<String>> split : splitFileLists.entrySet()) {
            DiskTreebank tb = tlpp.diskTreebank();
            FileFilter splitFilter = new SplitFilter(split.getValue());
            for (String path : pathNames) tb.loadPath(path, splitFilter);
            ObservedCorpusStats splitStats = gatherStats(tb, languageName.toString() + "." + split.getKey().toString());
            allSplitStats.add(splitStats);
            makeVocab = false;
        }
        display(aggregateStats(allSplitStats), displayWords, displayOOV);
        for (ObservedCorpusStats ocs : allSplitStats) display(ocs, displayWords, displayOOV);
    } else if (pathsAreFiles) {
        // Each path is a single treebank file; report statistics per file
        makeVocab = true;
        for (String path : pathNames) {
            DiskTreebank tb = tlpp.diskTreebank();
            tb.loadPath(path, pathname -> true);
            ObservedCorpusStats stats = gatherStats(tb, languageName.toString() + "  " + path);
            display(stats, displayWords, displayOOV);
            makeVocab = false;
        }
    } else {
        // Each path is a directory; load all files (skipping subdirectories) into one treebank
        trainVocab = Generics.newHashSet();
        DiskTreebank tb = tlpp.diskTreebank();
        for (String path : pathNames) tb.loadPath(path, pathname -> !pathname.isDirectory());
        ObservedCorpusStats allStats = gatherStats(tb, languageName.toString());
        display(allStats, displayWords, displayOOV);
    }
}
Also used: Properties(java.util.Properties) Counters(edu.stanford.nlp.stats.Counters) IOUtils(edu.stanford.nlp.io.IOUtils) Redwood(edu.stanford.nlp.util.logging.Redwood) DecimalFormat(java.text.DecimalFormat) PropertiesUtils(edu.stanford.nlp.util.PropertiesUtils) Set(java.util.Set) Tree(edu.stanford.nlp.trees.Tree) File(java.io.File) NumberFormat(java.text.NumberFormat) Stack(java.util.Stack) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) List(java.util.List) FileFilter(java.io.FileFilter) Counter(edu.stanford.nlp.stats.Counter) StringUtils(edu.stanford.nlp.util.StringUtils) Map(java.util.Map) Language(edu.stanford.nlp.international.Language) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Pair(edu.stanford.nlp.util.Pair) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Collections(java.util.Collections) Generics(edu.stanford.nlp.util.Generics)
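
The part of this example most worth lifting out is the loading idiom: obtain a DiskTreebank from a language's TreebankLangParserParams and restrict what gets loaded with a java.io.FileFilter. Below is a small sketch of just that idiom; the class name FilteredLoadSketch and the path are placeholders, EnglishTreebankParserParams is used as an example language pack, and the split/statistics machinery of TreebankStats is deliberately left out.

import java.io.FileFilter;

import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;

public class FilteredLoadSketch {
    public static void main(String[] args) {
        TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
        DiskTreebank tb = tlpp.diskTreebank();
        // load only plain files under the (placeholder) path, skipping subdirectories
        FileFilter filesOnly = pathname -> !pathname.isDirectory();
        tb.loadPath("/path/to/treebank", filesOnly);
        int nTrees = 0;
        for (Tree t : tb) {
            nTrees++;
        }
        System.out.printf("Loaded %d trees%n", nTrees);
    }
}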

Example 3 with DiskTreebank

Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

From the class RuleBranchingFactor, method main:

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage);
        System.exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
    String fileName = options.getProperty("");
    if (fileName == null || fileName.equals("")) {
        System.out.println(usage);
        System.exit(-1);
    }
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<>(20000);
    List<Integer> branchingFactors = new ArrayList<>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    // Read the treebank
    PrintWriter pw = tlpp.pw();
    for (Tree tree : tb) {
        if (tree.value().equals("ROOT")) {
            tree = tree.firstChild();
        }
        ++nTrees;
        for (Tree subTree : tree) {
            if (subTree.isPhrasal()) {
                if (subTree.numChildren() > 1) {
                    ++nBinaryRules;
                    branchingFactors.add(subTree.numChildren());
                    binaryBranchingFactors += subTree.numChildren();
                    binaryRuleTypes.incrementCount(treeToRuleString(subTree));
                } else {
                    ++nUnaryRules;
                }
            }
        }
    }
    double mean = (double) binaryBranchingFactors / (double) nBinaryRules;
    System.out.printf("#trees:\t%d%n", nTrees);
    System.out.printf("#binary:\t%d%n", nBinaryRules);
    System.out.printf("#binary types:\t%d%n", binaryRuleTypes.keySet().size());
    System.out.printf("mean branching:\t%.4f%n", mean);
    System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
    System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(binaryRuleTypes));
    System.out.printf("#unaries:\t%d%n", nUnaryRules);
}
Also used: DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)
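
A stripped-down sketch of the counting idiom above: iterate every subtree of every tree, count multi-child phrasal rules in a ClassicCounter, and report the entropy of the resulting distribution. The class name RuleCountSketch and the path are placeholders; treeToRuleString is specific to RuleBranchingFactor, so this sketch keys the counter on the parent label plus arity instead.

import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;

public class RuleCountSketch {
    public static void main(String[] args) {
        TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
        DiskTreebank tb = tlpp.diskTreebank();
        // placeholder path
        tb.loadPath("/path/to/treebank");
        Counter<String> ruleTypes = new ClassicCounter<>();
        for (Tree tree : tb) {
            // iterating a Tree visits all of its subtrees
            for (Tree subTree : tree) {
                if (subTree.isPhrasal() && subTree.numChildren() > 1) {
                    // key on parent label and arity as a stand-in for the full rule string
                    ruleTypes.incrementCount(subTree.value() + " -> " + subTree.numChildren());
                }
            }
        }
        System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(ruleTypes));
    }
}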

Example 4 with DiskTreebank

Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

From the class ManipulateTopBracket, method main:

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    DiskTreebank tb = null;
    // read the encoding from the "-e" option; "-l" is the language flag
    String encoding = options.getProperty("e", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    tb = tlpp.diskTreebank();
    String[] files = options.getProperty("", "").split("\\s+");
    if (files.length != 0) {
        for (String filename : files) {
            tb.loadPath(filename);
        }
    } else {
        log.info(usage());
        System.exit(-1);
    }
    PrintWriter pwo = tlpp.pw();
    String startSymbol = tlpp.treebankLanguagePack().startSymbol();
    TreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    for (Tree t : tb) {
        if (removeBracket) {
            if (t.value().equals(startSymbol)) {
                t = t.firstChild();
            }
        } else if (!t.value().equals(startSymbol)) {
            //Add a bracket if it isn't already there
            t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
        }
        pwo.println(t.toString());
        nTrees++;
    }
    pwo.close();
    System.err.printf("Processed %d trees.%n", nTrees);
}
Also used: DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Language(edu.stanford.nlp.international.Language) LabeledScoredTreeFactory(edu.stanford.nlp.trees.LabeledScoredTreeFactory) TreeFactory(edu.stanford.nlp.trees.TreeFactory) Tree(edu.stanford.nlp.trees.Tree) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) PrintWriter(java.io.PrintWriter)
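
A minimal sketch of just the bracket manipulation, detached from the option parsing above: wrap a tree in the language pack's start symbol if it is missing. The class name TopBracketSketch is a placeholder, and Tree.valueOf is used here only to build an example tree inline from a Penn-style bracketing; in the full class the trees come from the DiskTreebank.

import java.util.Collections;

import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;

public class TopBracketSketch {
    public static void main(String[] args) {
        TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
        String startSymbol = tlpp.treebankLanguagePack().startSymbol();
        TreeFactory tf = new LabeledScoredTreeFactory();
        // example tree without a top bracket
        Tree t = Tree.valueOf("(S (NP (DT The) (NN cat)) (VP (VBD slept)))");
        if (!t.value().equals(startSymbol)) {
            // add the top bracket if it isn't already there
            t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
        }
        System.out.println(t);
    }
}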

Example 5 with DiskTreebank

Use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

From the class PunctFrequencyDist, method main:

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    String puncTag = null;
    // Parse options: -l sets the language, -e the encoding; the remaining args are the punctuation tag followed by the treebank path
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            switch(args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            puncTag = args[i++];
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    Counter<String> puncTypes = new ClassicCounter<>();
    for (Tree t : tb) {
        List<CoreLabel> yield = t.taggedLabeledYield();
        for (CoreLabel word : yield) {
            if (word.tag().equals(puncTag)) {
                puncTypes.incrementCount(word.word());
            }
        }
    }
    List<String> biggestKeys = new ArrayList<>(puncTypes.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(puncTypes));
    PrintWriter pw = tlpp.pw();
    for (String wordType : biggestKeys) pw.printf("%s\t%d%n", wordType, (int) puncTypes.getCount(wordType));
    pw.close();
}
Also used: DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)
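
The reusable core of this example is building a frequency distribution over the words that carry a given POS tag and printing it in descending order of count. A sketch under assumed inputs follows; the class name TagFrequencySketch, the path, and the hard-coded "," tag are placeholders (the full class reads the tag and paths from the command line).

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;

public class TagFrequencySketch {
    public static void main(String[] args) {
        TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
        DiskTreebank tb = tlpp.diskTreebank();
        // placeholder path and tag
        tb.loadPath("/path/to/treebank");
        String targetTag = ",";
        Counter<String> wordCounts = new ClassicCounter<>();
        for (Tree t : tb) {
            // taggedLabeledYield returns one CoreLabel per token, with word and tag set
            for (CoreLabel word : t.taggedLabeledYield()) {
                if (word.tag().equals(targetTag)) {
                    wordCounts.incrementCount(word.word());
                }
            }
        }
        List<String> keys = new ArrayList<>(wordCounts.keySet());
        Collections.sort(keys, Counters.toComparatorDescending(wordCounts));
        for (String w : keys) {
            System.out.printf("%s\t%d%n", w, (int) wordCounts.getCount(w));
        }
    }
}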

Aggregations

DiskTreebank (edu.stanford.nlp.trees.DiskTreebank): 11
Language (edu.stanford.nlp.international.Language): 8
TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams): 8
Tree (edu.stanford.nlp.trees.Tree): 8
PrintWriter (java.io.PrintWriter): 7
ClassicCounter (edu.stanford.nlp.stats.ClassicCounter): 6
ArrayList (java.util.ArrayList): 6
EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams): 4
Properties (java.util.Properties): 4
Treebank (edu.stanford.nlp.trees.Treebank): 3
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 2
Label (edu.stanford.nlp.ling.Label): 2
NumberFormat (java.text.NumberFormat): 2
List (java.util.List): 2
IOUtils (edu.stanford.nlp.io.IOUtils): 1
NumberRangesFileFilter (edu.stanford.nlp.io.NumberRangesFileFilter): 1
Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon): 1
Options (edu.stanford.nlp.parser.lexparser.Options): 1
Counter (edu.stanford.nlp.stats.Counter): 1
Counters (edu.stanford.nlp.stats.Counters): 1