Search in sources :

Example 6 with DiskTreebank

use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

the class UNKPrinter method main.

/**
 * Counts unknown-word signatures over one or more treebank files and prints
 * them in descending order of frequency.
 *
 * <p>Options: {@code -l <language>} selects the language pack and
 * {@code -e <encoding>} the input/output encoding (default UTF-8). Every other
 * argument is loaded as a treebank path. A word contributes a signature only
 * when it occurs in a tree past the halfway point of the treebank and has been
 * seen fewer than two times so far.
 *
 * @param args command-line options followed by treebank paths
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            // Every supported option takes a value; a trailing flag is a usage error,
            // not an ArrayIndexOutOfBoundsException.
            if (i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
            }
            switch (args[i]) {
                case "-l":
                    lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    // The treebank is created lazily so that -l / -e given before
                    // the first path take effect.
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    if (tb == null) {
        // No treebank path was supplied; nothing to count.
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }

    PrintWriter pw = tlpp.pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    // Language-specific unknown-word model settings.
    if (lang == Language.French) {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    } else if (lang == Language.Arabic) {
        lexOptions.smartMutation = false;
        lexOptions.useUnknownWordSignatures = 9;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 1;
    }
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Lexicon lex = tlpp.lex(op, wordIndex, tagIndex);

    // Signatures are only collected for trees after the first half of the treebank.
    int computeAfter = (int) (0.50 * tb.size());
    Counter<String> vocab = new ClassicCounter<>();
    Counter<String> unkCounter = new ClassicCounter<>();
    int treeId = 0;
    for (Tree t : tb) {
        List<Label> yield = t.yield();
        int posId = 0;
        for (Label word : yield) {
            String token = word.value();
            vocab.incrementCount(token);
            if (treeId > computeAfter && vocab.getCount(token) < 2.0) {
                unkCounter.incrementCount(lex.getUnknownWordModel().getSignature(token, posId));
            }
            // Advance for every word so posId is the word's actual index in the
            // sentence, not the number of signatures computed so far.
            posId++;
        }
        treeId++;
    }

    List<String> biggestKeys = new ArrayList<>(unkCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(unkCounter));
    for (String wordType : biggestKeys) {
        pw.printf("%s\t%d%n", wordType, (int) unkCounter.getCount(wordType));
    }
    pw.close();
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Lexicon(edu.stanford.nlp.parser.lexparser.Lexicon) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) HashIndex(edu.stanford.nlp.util.HashIndex) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 7 with DiskTreebank

use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

the class BaseLexicon method main.

/**
 * Provides some testing and opportunities for exploration of the
 * probabilities of a BaseLexicon. What's here currently probably
 * only works for the English Penn Treebank, as it uses default
 * constructors. Of the words given to test on,
 * the first is treated as sentence initial, and the rest as not
 * sentence initial.
 *
 * <p>For each known word the log probabilities of its taggings are printed;
 * for each unknown word its signature, the score of every tag, and the list
 * of tags scored as impossible are printed.
 *
 * @param args The command line arguments:
 *     java BaseLexicon treebankPath fileRange unknownWordModel words*
 */
public static void main(String[] args) {
    if (args.length < 3) {
        log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
        return;
    }
    System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
    BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
    lex.initializeTraining(tb.size());
    lex.train(tb);
    lex.finishTraining();
    System.out.println("done.");
    System.out.println();
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(4);
    List<String> impos = new ArrayList<>();
    for (int i = 3; i < args.length; i++) {
        // Position of this word among the test words: 0 means sentence initial.
        int loc = i - 3;
        if (lex.isKnown(args[i])) {
            System.out.println(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
            for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), loc, null); it.hasNext(); ) {
                IntTaggedWord iTW = it.next();
                System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, loc, wordIndex.get(iTW.word), null)));
            }
        } else {
            String sig = lex.getUnknownWordModel().getSignature(args[i], loc);
            // Both alternatives carry a leading space so the label never fuses with the level number.
            System.out.println(args[i] + " is an unknown word.  Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
            impos.clear();
            List<String> lis = new ArrayList<>(tagIndex.objectsList());
            Collections.sort(lis);
            for (String tStr : lis) {
                IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                // Score at the same position the signature was computed for, so the first
                // word is treated as sentence initial here too.
                double score = lex.score(iTW, loc, args[i], null);
                if (score == Float.NEGATIVE_INFINITY) {
                    impos.add(tStr);
                } else {
                    System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
                }
            }
            if (!impos.isEmpty()) {
                System.out.println(args[i] + " impossible tags: " + impos);
            }
        }
        System.out.println();
    }
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Treebank(edu.stanford.nlp.trees.Treebank) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) NumberFormat(java.text.NumberFormat)

Example 8 with DiskTreebank

use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

the class VocabFrequency method main.

/**
 * Prints the vocabulary of one or more treebank files, one word type per line
 * with its count, in descending order of frequency.
 *
 * <p>Options: {@code -l <language>} selects the language pack and
 * {@code -e <encoding>} the input/output encoding (default UTF-8). Every other
 * argument is loaded as a treebank path.
 *
 * @param args command-line options followed by treebank paths
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    for (int i = 0; i < args.length; i++) {
        if (args[i].startsWith("-")) {
            // Every supported option takes a value; a trailing flag is a usage error,
            // not an ArrayIndexOutOfBoundsException.
            if (i + 1 >= args.length) {
                System.out.println(usage.toString());
                System.exit(-1);
            }
            switch (args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    // The treebank is created lazily so that -l / -e given before
                    // the first path take effect.
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            tb.loadPath(args[i]);
        }
    }
    if (tb == null) {
        // Only options were supplied; without a treebank there is nothing to count.
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }

    Counter<String> vocab = new ClassicCounter<>();
    for (Tree t : tb) {
        for (Label word : t.yield()) {
            vocab.incrementCount(word.value());
        }
    }

    List<String> biggestKeys = new ArrayList<>(vocab.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(vocab));
    PrintWriter pw = tlpp.pw();
    for (String wordType : biggestKeys) {
        pw.printf("%s\t%d%n", wordType, (int) vocab.getCount(wordType));
    }
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Label(edu.stanford.nlp.ling.Label) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)

Example 9 with DiskTreebank

use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

the class CountTrees method main.

/**
 * Counts the trees in a treebank whose yield is no longer than a maximum
 * length, optionally printing each counted tree in one of several formats.
 *
 * <p>Properties read from the parsed arguments: {@code y} (maximum yield
 * length), {@code p} (print trees), {@code f} (print flattened yields),
 * {@code a} (print pre-terminal yields), {@code t} (print word/tag pairs, one
 * per line), {@code l} (language, default English) and {@code e} (encoding,
 * default UTF-8). The print modes are mutually exclusive and checked in that
 * order. The final count is written to standard error.
 *
 * @param args command-line options and the treebank path
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage);
        System.exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
    String fileName = options.getProperty("");
    if (fileName == null || fileName.equals("")) {
        System.out.println(usage);
        System.exit(-1);
    }
    int maxLen = PropertiesUtils.getInt(options, "y", Integer.MAX_VALUE);
    boolean printTrees = PropertiesUtils.getBool(options, "p", false);
    boolean flattenTrees = PropertiesUtils.getBool(options, "f", false);
    boolean printPOS = PropertiesUtils.getBool(options, "a", false);
    boolean printTnT = PropertiesUtils.getBool(options, "t", false);
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
    // Read the treebank
    PrintWriter pw = tlpp.pw();
    int numTrees = 0;
    for (Tree tree : tb) {
        if (tree.yield().size() > maxLen) {
            continue;
        }
        ++numTrees;
        if (printTrees) {
            pw.println(tree.toString());
        } else if (flattenTrees) {
            pw.println(SentenceUtils.listToString(tree.yield()));
        } else if (printPOS) {
            pw.println(SentenceUtils.listToString(tree.preTerminalYield()));
        } else if (printTnT) {
            List<CoreLabel> yield = tree.taggedLabeledYield();
            for (CoreLabel label : yield) {
                pw.printf("%s\t%s%n", label.word(), label.tag());
            }
            pw.println();
        }
    }
    // The writer is otherwise never flushed or closed in this method; flush so
    // no printed output is lost regardless of how the writer is configured.
    pw.flush();
    System.err.printf("Read %d trees.%n", numTrees);
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Language(edu.stanford.nlp.international.Language) Tree(edu.stanford.nlp.trees.Tree) List(java.util.List) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) PrintWriter(java.io.PrintWriter)

Example 10 with DiskTreebank

use of edu.stanford.nlp.trees.DiskTreebank in project CoreNLP by stanfordnlp.

the class RHSFrequency method main.

/**
 * Prints the right-hand sides (sequences of child labels) found under nodes of
 * a given category in a treebank, one per line with its count, in descending
 * order of frequency.
 *
 * <p>Options: {@code -l <language>} selects the language pack and
 * {@code -e <encoding>} the input/output encoding (default UTF-8). Positional
 * arguments come in pairs: a category, compiled into the Tregex pattern
 * {@code @category}, followed by the treebank path to load. If several pairs
 * are given, all paths are loaded and the last category is the one matched.
 *
 * @param args command-line options followed by category / treebank path pairs
 */
public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage.toString());
        System.exit(-1);
    }
    TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    String encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.length; i++) {
        // Options need a value and a category needs a path, so in either case a
        // dangling last argument is a usage error rather than an
        // ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
            System.out.println(usage.toString());
            System.exit(-1);
        }
        if (args[i].startsWith("-")) {
            switch (args[i]) {
                case "-l":
                    Language lang = Language.valueOf(args[++i].trim());
                    tlpp = lang.params;
                    break;
                case "-e":
                    encoding = args[++i];
                    break;
                default:
                    System.out.println(usage.toString());
                    System.exit(-1);
            }
        } else {
            rootMatch = TregexPattern.compile("@" + args[i]);
            if (tb == null) {
                if (tlpp == null) {
                    System.out.println(usage.toString());
                    System.exit(-1);
                } else {
                    // The treebank is created lazily so that -l / -e given before
                    // the first path take effect.
                    tlpp.setInputEncoding(encoding);
                    tlpp.setOutputEncoding(encoding);
                    tb = tlpp.diskTreebank();
                }
            }
            // Consume exactly one extra argument (the path); the loop's own i++
            // then lands on the argument right after the pair instead of skipping it.
            tb.loadPath(args[++i]);
        }
    }
    if (tb == null || rootMatch == null) {
        // Only options were supplied; there is no category or treebank to process.
        System.out.println(usage.toString());
        System.exit(-1);
        return;
    }

    Counter<String> rhsCounter = new ClassicCounter<>();
    for (Tree t : tb) {
        TregexMatcher m = rootMatch.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            StringBuilder sb = new StringBuilder();
            for (Tree kid : match.children()) {
                sb.append(kid.value()).append(" ");
            }
            rhsCounter.incrementCount(sb.toString().trim());
        }
    }

    List<String> biggestKeys = new ArrayList<>(rhsCounter.keySet());
    Collections.sort(biggestKeys, Counters.toComparatorDescending(rhsCounter));
    PrintWriter pw = tlpp.pw();
    for (String rhs : biggestKeys) {
        pw.printf("%s\t%d%n", rhs, (int) rhsCounter.getCount(rhs));
    }
    pw.close();
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) TregexPattern(edu.stanford.nlp.trees.tregex.TregexPattern) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) EnglishTreebankParserParams(edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) TregexMatcher(edu.stanford.nlp.trees.tregex.TregexMatcher) PrintWriter(java.io.PrintWriter)

Aggregations

DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)11 Language (edu.stanford.nlp.international.Language)8 TreebankLangParserParams (edu.stanford.nlp.parser.lexparser.TreebankLangParserParams)8 Tree (edu.stanford.nlp.trees.Tree)8 PrintWriter (java.io.PrintWriter)7 ClassicCounter (edu.stanford.nlp.stats.ClassicCounter)6 ArrayList (java.util.ArrayList)6 EnglishTreebankParserParams (edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams)4 Properties (java.util.Properties)4 Treebank (edu.stanford.nlp.trees.Treebank)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)2 Label (edu.stanford.nlp.ling.Label)2 NumberFormat (java.text.NumberFormat)2 List (java.util.List)2 IOUtils (edu.stanford.nlp.io.IOUtils)1 NumberRangesFileFilter (edu.stanford.nlp.io.NumberRangesFileFilter)1 Lexicon (edu.stanford.nlp.parser.lexparser.Lexicon)1 Options (edu.stanford.nlp.parser.lexparser.Options)1 Counter (edu.stanford.nlp.stats.Counter)1 Counters (edu.stanford.nlp.stats.Counters)1