
Example 6 with NumberRangesFileFilter

Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

From the class ChineseMaxentLexicon, the method main:

public static void main(String[] args) {
    TreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
    TreebankLanguagePack ctlp = tlpParams.treebankLanguagePack();
    Options op = new Options(tlpParams);
    TreeAnnotator ta = new TreeAnnotator(tlpParams.headFinder(), tlpParams, op);
    log.info("Reading Trees...");
    FileFilter trainFilter = new NumberRangesFileFilter(args[1], true);
    Treebank trainTreebank = tlpParams.memoryTreebank();
    trainTreebank.loadPath(args[0], trainFilter);
    log.info("Annotating trees...");
    Collection<Tree> trainTrees = new ArrayList<>();
    for (Tree tree : trainTreebank) {
        trainTrees.add(ta.transformTree(tree));
    }
    // saves memory
    trainTreebank = null;
    log.info("Training lexicon...");
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    int featureLevel = DEFAULT_FEATURE_LEVEL;
    if (args.length > 3) {
        featureLevel = Integer.parseInt(args[3]);
    }
    ChineseMaxentLexicon lex = new ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
    lex.initializeTraining(trainTrees.size());
    lex.train(trainTrees);
    lex.finishTraining();
    log.info("Testing");
    FileFilter testFilter = new NumberRangesFileFilter(args[2], true);
    Treebank testTreebank = tlpParams.memoryTreebank();
    testTreebank.loadPath(args[0], testFilter);
    List<TaggedWord> testWords = new ArrayList<>();
    for (Tree t : testTreebank) {
        testWords.addAll(t.taggedYield());
    }
    int[] totalAndCorrect = lex.testOnTreebank(testWords);
    log.info("done.");
    System.out.println(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double) totalAndCorrect[1]) / totalAndCorrect[0]);
}
Also used : NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Treebank(edu.stanford.nlp.trees.Treebank) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Tree(edu.stanford.nlp.trees.Tree) TreebankLanguagePack(edu.stanford.nlp.trees.TreebankLanguagePack)
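
A minimal sketch of driving this main programmatically, assuming ChineseMaxentLexicon lives in edu.stanford.nlp.parser.lexparser; the treebank path, file ranges, and feature level below are illustrative placeholders, not values from the example:

import edu.stanford.nlp.parser.lexparser.ChineseMaxentLexicon;

public class ChineseMaxentLexiconDemo {
    public static void main(String[] unused) {
        // args: treebankPath, training file range, test file range,
        // and an optional feature level (args[3] in the main above).
        ChineseMaxentLexicon.main(new String[] {
            "/path/to/ctb", "1-270", "301-325", "2"
        });
    }
}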

Example 7 with NumberRangesFileFilter

Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

From the class ChineseTreebankParserParams, the method main:

/**
   * For testing: loads a treebank and prints the trees.
   */
public static void main(String[] args) {
    TreebankLangParserParams tlpp = new ChineseTreebankParserParams();
    System.out.println("Default encoding is: " + tlpp.diskTreebank().encoding());
    if (args.length < 2) {
        printlnErr("Usage: edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams treesPath fileRange");
    } else {
        Treebank m = tlpp.diskTreebank();
        m.loadPath(args[0], new NumberRangesFileFilter(args[1], false));
        for (Tree t : m) {
            t.pennPrint(tlpp.pw());
        }
        System.out.println("There were " + m.size() + " trees.");
    }
}
Also used : NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter)
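
Since every example on this page funnels a command-line argument into NumberRangesFileFilter, a small standalone sketch of the filter itself may help. The path below is a placeholder; the ranges expression is the comma-separated mix of single numbers and low-high ranges that these mains pass through from the command line:

import java.io.File;
import edu.stanford.nlp.io.NumberRangesFileFilter;

public class RangesFilterDemo {
    public static void main(String[] args) {
        // Accept files whose names contain a number in 1-99 or exactly 150.
        NumberRangesFileFilter filter = new NumberRangesFileFilter("1-99,150", true);
        File file = new File("/path/to/ctb/chtb_0042.mrg");  // placeholder path
        System.out.println(file + (filter.accept(file) ? " accepted" : " rejected"));
    }
}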

Example 8 with NumberRangesFileFilter

Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

From the class ChineseLexiconAndWordSegmenter, the method main:

/** This method lets you train and test a segmenter relative to a
   *  Treebank.
   *  <p>
   *  <i>Implementation note:</i> This method is largely cloned from
   *  LexicalizedParser's main method.  Should we try to make it able to
   *  train segmenters, to stop the two from going out of sync?
   */
public static void main(String[] args) {
    boolean train = false;
    boolean saveToSerializedFile = false;
    boolean saveToTextFile = false;
    String serializedInputFileOrUrl = null;
    String textInputFileOrUrl = null;
    String serializedOutputFileOrUrl = null;
    String textOutputFileOrUrl = null;
    String treebankPath = null;
    Treebank testTreebank = null;
    // Treebank tuneTreebank = null;
    String testPath = null;
    FileFilter testFilter = null;
    FileFilter trainFilter = null;
    String encoding = null;
    // variables needed to process the files to be parsed
    TokenizerFactory<Word> tokenizerFactory = null;
    //    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
    // whether or not the input file has already been tokenized
    boolean tokenized = false;
    Function<List<HasWord>, List<HasWord>> escaper = new ChineseEscaper();
    // int tagDelimiter = -1;
    // String sentenceDelimiter = "\n";
    // boolean fromXML = false;
    int argIndex = 0;
    if (args.length < 1) {
        log.info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
        return;
    }
    Options op = new Options();
    op.tlpParams = new ChineseTreebankParserParams();
    // while loop through option arguments
    while (argIndex < args.length && args[argIndex].charAt(0) == '-') {
        if (args[argIndex].equalsIgnoreCase("-train")) {
            train = true;
            saveToSerializedFile = true;
            int numSubArgs = numSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs > 1) {
                treebankPath = args[argIndex];
                argIndex++;
            } else {
                throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2) {
                trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
            } else if (numSubArgs >= 3) {
                try {
                    int low = Integer.parseInt(args[argIndex]);
                    int high = Integer.parseInt(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                } catch (NumberFormatException e) {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        } else if (args[argIndex].equalsIgnoreCase("-encoding")) {
            // sets encoding for TreebankLangParserParams
            encoding = args[argIndex + 1];
            op.tlpParams.setInputEncoding(encoding);
            op.tlpParams.setOutputEncoding(encoding);
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile")) {
            // load the parser from a binary serialized file
            // the next argument must be the path to the parser file
            serializedInputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        // doesn't make sense to load from TextFile -pichuan
        //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
        //        // load the parser from declarative text file
        //        // the next argument must be the path to the parser file
        //        textInputFileOrUrl = args[argIndex + 1];
        //        argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {
            saveToSerializedFile = true;
            serializedOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {
            // save the parser to declarative text file
            saveToTextFile = true;
            textOutputFileOrUrl = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-treebank")) {
            // the next argument is the treebank path and range for testing
            int numSubArgs = numSubArgs(args, argIndex);
            argIndex++;
            if (numSubArgs == 1) {
                testFilter = new NumberRangesFileFilter(args[argIndex++], true);
            } else if (numSubArgs > 1) {
                testPath = args[argIndex++];
                if (numSubArgs == 2) {
                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                } else if (numSubArgs >= 3) {
                    try {
                        int low = Integer.parseInt(args[argIndex]);
                        int high = Integer.parseInt(args[argIndex + 1]);
                        testFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    } catch (NumberFormatException e) {
                        // maybe it's a ranges expression?
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                }
            }
        } else {
            int j = op.tlpParams.setOptionFlag(args, argIndex);
            if (j == argIndex) {
                log.info("Unknown option ignored: " + args[argIndex]);
                j++;
            }
            argIndex = j;
        }
    }
    // end while loop through arguments
    TreebankLangParserParams tlpParams = op.tlpParams;
    // all other arguments are order dependent and
    // are processed in order below
    ChineseLexiconAndWordSegmenter cs = null;
    if (!train && op.testOptions.verbose) {
        System.out.println("Currently " + new Date());
        printArgs(args, System.out);
    }
    if (train) {
        printArgs(args, System.out);
        // so we train a parser using the treebank
        if (treebankPath == null) {
            // the next arg must be the treebank path, since it wasn't given earlier
            treebankPath = args[argIndex];
            argIndex++;
            if (args.length > argIndex + 1) {
                try {
                    // the next two args might be the range
                    int low = Integer.parseInt(args[argIndex]);
                    int high = Integer.parseInt(args[argIndex + 1]);
                    trainFilter = new NumberRangeFileFilter(low, high, true);
                    argIndex += 2;
                } catch (NumberFormatException e) {
                    // maybe it's a ranges expression?
                    trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    argIndex++;
                }
            }
        }
        Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);
        Index<String> wordIndex = new HashIndex<>();
        Index<String> tagIndex = new HashIndex<>();
        cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
    } else if (textInputFileOrUrl != null) {
    // so we load the segmenter from a text grammar file
    // XXXXX fix later -pichuan
    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
    } else {
        // so we load a serialized segmenter
        if (serializedInputFileOrUrl == null) {
            // the next argument must be the path to the serialized parser
            serializedInputFileOrUrl = args[argIndex];
            argIndex++;
        }
        try {
            cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
        } catch (IllegalArgumentException e) {
            log.info("Error loading segmenter, exiting...");
            System.exit(1);
        }
    }
    // the following has to go after reading parser to make sure
    // op and tlpParams are the same for train and test
    TreePrint treePrint = op.testOptions.treePrint(tlpParams);
    if (testFilter != null) {
        if (testPath == null) {
            if (treebankPath == null) {
                throw new RuntimeException("No test treebank path specified...");
            } else {
                log.info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
        }
        testTreebank = tlpParams.testMemoryTreebank();
        testTreebank.loadPath(testPath, testFilter);
    }
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(tlpParams.sisterSplitters()));
    // -- Roger
    if (op.testOptions.verbose) {
        log.info("Lexicon is " + cs.getClass().getName());
    }
    PrintWriter pwOut = tlpParams.pw();
    PrintWriter pwErr = tlpParams.pw(System.err);
    // Now what do we do with the parser we've made
    if (saveToTextFile) {
        // save the parser to textGrammar format
        if (textOutputFileOrUrl != null) {
            saveSegmenterDataToText(cs, textOutputFileOrUrl);
        } else {
            log.info("Usage: must specify a text segmenter data output path");
        }
    }
    if (saveToSerializedFile) {
        if (serializedOutputFileOrUrl == null && argIndex < args.length) {
            // the next argument must be the path to serialize to
            serializedOutputFileOrUrl = args[argIndex];
            argIndex++;
        }
        if (serializedOutputFileOrUrl != null) {
            saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
        } else if (textOutputFileOrUrl == null && testTreebank == null) {
            // no saving/parsing request has been specified
            log.info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
        }
    }
    /* --------------------- Testing part!!!! ----------------------- */
    if (op.testOptions.verbose) {
    //      printOptions(false, op);
    }
    if (testTreebank != null || (argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank"))) {
        // test parser on treebank
        if (testTreebank == null) {
            // the next argument is the treebank path and range for testing
            testTreebank = tlpParams.testMemoryTreebank();
            if (args.length < argIndex + 4) {
                testTreebank.loadPath(args[argIndex + 1]);
            } else {
                int testlow = Integer.parseInt(args[argIndex + 2]);
                int testhigh = Integer.parseInt(args[argIndex + 3]);
                testTreebank.loadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
            }
        }
    /* TODO - test segmenting on treebank. -pichuan */
    //      lp.testOnTreebank(testTreebank);
    //    } else if (argIndex >= args.length) {
    //      // no more arguments, so we just parse our own test sentence
    //      if (lp.parse(op.tlpParams.defaultTestSentence())) {
    //        treePrint.printTree(lp.getBestParse(), pwOut);
    //      } else {
    //        pwErr.println("Error. Can't parse test sentence: " +
    //              lp.parse(op.tlpParams.defaultTestSentence()));
    //      }
    }
//wsg2010: This code block doesn't actually do anything. It appears to read and tokenize a file, and then just print it.
//         There are easier ways to do that. This code was copied from an old version of LexicalizedParser.
//    else {
//      // We parse filenames given by the remaining arguments
//      int numWords = 0;
//      Timing timer = new Timing();
//      // set the tokenizer
//      if (tokenized) {
//        tokenizerFactory = WhitespaceTokenizer.factory();
//      }
//      TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
//      if (tokenizerFactory == null) {
//        tokenizerFactory = (TokenizerFactory<Word>) tlp.getTokenizerFactory();
//      }
//      documentPreprocessor.setTokenizerFactory(tokenizerFactory);
//      documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
//      if (encoding != null) {
//        documentPreprocessor.setEncoding(encoding);
//      }
//      timer.start();
//      for (int i = argIndex; i < args.length; i++) {
//        String filename = args[i];
//        try {
//          List document = null;
//          if (fromXML) {
//            document = documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized);
//          } else {
//            document = documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
//          }
//          log.info("Segmenting file: " + filename + " with " + document.size() + " sentences.");
//          PrintWriter pwo = pwOut;
//          if (op.testOptions.writeOutputFiles) {
//            try {
//              pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
//            } catch (IOException ioe) {
//              ioe.printStackTrace();
//            }
//          }
//          int num = 0;
//          treePrint.printHeader(pwo, tlp.getEncoding());
//          for (Iterator it = document.iterator(); it.hasNext();) {
//            num++;
//            List sentence = (List) it.next();
//            int len = sentence.size();
//            numWords += len;
////            pwErr.println("Parsing [sent. " + num + " len. " + len + "]: " + sentence);
//            pwo.println(Sentence.listToString(sentence));
//          }
//          treePrint.printFooter(pwo);
//          if (op.testOptions.writeOutputFiles) {
//            pwo.close();
//          }
//        } catch (IOException e) {
//          pwErr.println("Couldn't find file: " + filename);
//        }
//
//      } // end for each file
//      long millis = timer.stop();
//      double wordspersec = numWords / (((double) millis) / 1000);
//      NumberFormat nf = new DecimalFormat("0.00"); // easier way!
//      pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
//    }
}
Also used : NumberRangeFileFilter(edu.stanford.nlp.io.NumberRangeFileFilter) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Word(edu.stanford.nlp.ling.Word) ChineseEscaper(edu.stanford.nlp.trees.international.pennchinese.ChineseEscaper) HashIndex(edu.stanford.nlp.util.HashIndex)
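
The option loop above repeats one idiom several times: try to parse two integers as a low/high pair for NumberRangeFileFilter, and on NumberFormatException fall back to treating the single argument as a ranges expression for NumberRangesFileFilter. A hedged sketch of that idiom as a standalone helper (the class and method names here are hypothetical, not part of CoreNLP):

import java.io.FileFilter;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;

final class RangeFilterArgs {
    // Hypothetical helper mirroring the inline idiom: prefer a low/high
    // integer pair, fall back to a single ranges expression.
    static FileFilter parseRangeFilter(String[] args, int i) {
        try {
            int low = Integer.parseInt(args[i]);
            int high = Integer.parseInt(args[i + 1]);
            return new NumberRangeFileFilter(low, high, true);   // e.g. "200 270"
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            return new NumberRangesFileFilter(args[i], true);    // e.g. "200-270,301"
        }
    }

    public static void main(String[] args) {
        System.out.println(parseRangeFilter(new String[] { "200", "270" }, 0));
        System.out.println(parseRangeFilter(new String[] { "200-270,301" }, 0));
    }
}

A real extraction would also have to report how many arguments it consumed (one or two), which is presumably why the original keeps the pattern inline and bumps argIndex at each site.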

Example 9 with NumberRangesFileFilter

Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

From the class BaseLexicon, the method main:

/** Provides some testing and opportunities for exploration of the
   *  probabilities of a BaseLexicon.  What's here currently probably
   *  only works for the English Penn Treebank, as it uses default
   *  constructors.  Of the words given to test on,
   *  the first is treated as sentence initial, and the rest as not
   *  sentence initial.
   *
   *  @param args The command line arguments:
   *     java BaseLexicon treebankPath fileRange unknownWordModel words*
   */
public static void main(String[] args) {
    if (args.length < 3) {
        log.info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
        return;
    }
    System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = Integer.parseInt(args[2]);
    BaseLexicon lex = new BaseLexicon(op, wordIndex, tagIndex);
    lex.initializeTraining(tb.size());
    lex.train(tb);
    lex.finishTraining();
    System.out.println("done.");
    System.out.println();
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(4);
    List<String> impos = new ArrayList<>();
    for (int i = 3; i < args.length; i++) {
        if (lex.isKnown(args[i])) {
            System.out.println(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
            for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.addToIndex(args[i]), i - 3, null); it.hasNext(); ) {
                IntTaggedWord iTW = it.next();
                System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word), null)));
            }
        } else {
            String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
            System.out.println(args[i] + " is an unknown word.  Signature with uwm " + lex.getUnknownWordModel().getUnknownLevel() + ((i == 3) ? " init" : "non-init") + " is: " + sig);
            impos.clear();
            List<String> lis = new ArrayList<>(tagIndex.objectsList());
            Collections.sort(lis);
            for (String tStr : lis) {
                IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                double score = lex.score(iTW, 1, args[i], null);
                if (score == Double.NEGATIVE_INFINITY) {
                    impos.add(tStr);
                } else {
                    System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
                }
            }
            if (impos.size() > 0) {
                System.out.println(args[i] + " impossible tags: " + impos);
            }
        }
        System.out.println();
    }
}
Also used : NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) Treebank(edu.stanford.nlp.trees.Treebank) NumberFormat(java.text.NumberFormat)
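
An illustrative way to drive this test harness, assuming BaseLexicon lives in edu.stanford.nlp.parser.lexparser; the treebank path, file range, unknown-word signature level, and test words are all placeholders:

public class BaseLexiconDemo {
    public static void main(String[] unused) {
        // args: treebankPath fileRange unknownWordModel words*
        edu.stanford.nlp.parser.lexparser.BaseLexicon.main(new String[] {
            "/path/to/wsj", "200-2199", "5", "the", "donkey", "xyzzy"
        });
    }
}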

Example 10 with NumberRangesFileFilter

Use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

From the class ArgUtils, the method getWeightedTreebankDescription:

public static Triple<String, FileFilter, Double> getWeightedTreebankDescription(String[] args, int argIndex, String flag) {
    String path = null;
    FileFilter filter = null;
    Double weight = 1.0;
    // the next arguments are the treebank path and maybe the range for testing
    int numSubArgs = numSubArgs(args, argIndex);
    if (numSubArgs > 0 && numSubArgs < 4) {
        argIndex++;
        path = args[argIndex++];
        boolean hasWeight = false;
        if (numSubArgs > 1 && DOUBLE_PATTERN.matcher(args[argIndex + numSubArgs - 2]).matches()) {
            weight = Double.parseDouble(args[argIndex + numSubArgs - 2]);
            hasWeight = true;
            numSubArgs--;
        }
        if (numSubArgs == 2) {
            filter = new NumberRangesFileFilter(args[argIndex++], true);
        } else if (numSubArgs == 3) {
            try {
                int low = Integer.parseInt(args[argIndex]);
                int high = Integer.parseInt(args[argIndex + 1]);
                filter = new NumberRangeFileFilter(low, high, true);
                argIndex += 2;
            } catch (NumberFormatException e) {
                // maybe it's a ranges expression?
                filter = new NumberRangesFileFilter(args[argIndex++], true);
            }
        }
        if (hasWeight) {
            argIndex++;
        }
    } else {
        throw new IllegalArgumentException("Bad arguments after " + flag);
    }
    return Triple.makeTriple(path, filter, weight);
}
Also used : NumberRangeFileFilter(edu.stanford.nlp.io.NumberRangeFileFilter) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) FileFilter(java.io.FileFilter)
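
A hedged sketch of calling this helper, assuming ArgUtils lives in edu.stanford.nlp.parser.lexparser; the flag, path, range, and weight are illustrative, laid out as the parsing above expects (path first, then an optional range, then an optional trailing weight):

import java.io.FileFilter;
import edu.stanford.nlp.parser.lexparser.ArgUtils;
import edu.stanford.nlp.util.Triple;

public class WeightedTreebankDemo {
    public static void main(String[] args) {
        String[] cli = { "-train", "/path/to/treebank", "1-100", "0.5" };
        Triple<String, FileFilter, Double> desc =
                ArgUtils.getWeightedTreebankDescription(cli, 0, "-train");
        // Expect path=/path/to/treebank, a NumberRangesFileFilter for 1-100,
        // and weight=0.5.
        System.out.println("path=" + desc.first() + ", filter=" + desc.second()
                + ", weight=" + desc.third());
    }
}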

Aggregations

NumberRangesFileFilter (edu.stanford.nlp.io.NumberRangesFileFilter) 11
NumberRangeFileFilter (edu.stanford.nlp.io.NumberRangeFileFilter) 3
TaggedWord (edu.stanford.nlp.ling.TaggedWord) 3
Tree (edu.stanford.nlp.trees.Tree) 3
FileFilter (java.io.FileFilter) 3
HasWord (edu.stanford.nlp.ling.HasWord) 2
Word (edu.stanford.nlp.ling.Word) 2
TreeTransformer (edu.stanford.nlp.trees.TreeTransformer) 2
Treebank (edu.stanford.nlp.trees.Treebank) 2
HashIndex (edu.stanford.nlp.util.HashIndex) 2
File (java.io.File) 2
StringLabel (edu.stanford.nlp.ling.StringLabel) 1
ChineseTreebankParserParams (edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams) 1
LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser) 1
Options (edu.stanford.nlp.parser.lexparser.Options) 1
WordSegmenter (edu.stanford.nlp.process.WordSegmenter) 1
EquivalenceClassEval (edu.stanford.nlp.stats.EquivalenceClassEval) 1
DiskTreebank (edu.stanford.nlp.trees.DiskTreebank) 1
MemoryTreebank (edu.stanford.nlp.trees.MemoryTreebank) 1
TreeNormalizer (edu.stanford.nlp.trees.TreeNormalizer) 1