Search in sources :

Example 1 with LexicalizedParser

use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.

the class CharacterLevelTagExtender method main.

/**
 * For testing -- CURRENTLY BROKEN!!!
 * <br>
 * Trains a character-tag Chinese parser from a treebank (or, if training
 * raises an {@link IllegalArgumentException}, loads a parser using
 * {@code args[1]} as the model path), serializes a freshly trained parser to
 * {@code chineseCharTagPCFG.ser.gz}, parses the test portion of the treebank
 * and writes gold and guessed trees to {@code out.chi} in GB18030.
 *
 * @param args exactly three values: {@code treebankPath trainNums testNums}.
 *             {@code trainNums} and {@code testNums} are handed to
 *             {@link NumberRangesFileFilter}.
 * @throws IOException if {@code out.chi} cannot be opened or the encoding is
 *                     not supported
 */
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        throw new RuntimeException("args: treebankPath trainNums testNums");
    }
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;
    LexicalizedParser lp;
    try {
        FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
        lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
        String filename = "chineseCharTagPCFG.ser.gz";
        log.info("Writing parser in serialized format to file " + filename + " ");
        System.err.flush();
        try {
            // try-with-resources so the stream is released even when
            // writeObject fails part-way through.
            try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
                out.writeObject(lp);
            }
            log.info("done.");
        } catch (IOException ioe) {
            // Serialization is best-effort: report and carry on with the
            // in-memory parser.
            ioe.printStackTrace();
        }
    } catch (IllegalArgumentException e) {
        // Fall back to treating the second argument as a model to load.
        lp = LexicalizedParser.loadModel(args[1], op);
    }
    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    // Both the file stream and the writer are managed resources so the
    // output file is always flushed and closed, including on failure.
    try (FileOutputStream fos = new FileOutputStream("out.chi");
         PrintWriter pw = new PrintWriter(new OutputStreamWriter(fos, "GB18030"), true)) {
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
        //    System.out.println("Preterminals:" + preterminals);
        System.out.println("Testing...");
        for (Tree gold : testTreebank) {
            Tree tree;
            try {
                tree = lp.parseTree(gold.yieldHasWord());
                if (tree == null) {
                    System.out.println("Failed to parse " + gold.yieldHasWord());
                    continue;
                }
            } catch (Exception e) {
                // A failure on one sentence must not abort the whole run.
                e.printStackTrace();
                continue;
            }
            gold = gold.firstChild();
            pw.println(SentenceUtils.listToString(gold.preTerminalYield()));
            pw.println(SentenceUtils.listToString(gold.yield()));
            gold.pennPrint(pw);
            pw.println(tree.preTerminalYield());
            pw.println(tree.yield());
            tree.pennPrint(pw);
            // NOTE: the bracket evaluation below is disabled, so eval is never
            // given any data before displayLast()/display() are called.
            //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
            //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
            //      eval.eval(allBrackets, goldBrackets);
            eval.displayLast();
        }
        System.out.println();
        System.out.println();
        eval.display();
    }
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) EquivalenceClassEval(edu.stanford.nlp.stats.EquivalenceClassEval) ChineseTreebankParserParams(edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams)

Example 2 with LexicalizedParser

use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.

the class TaggerParserPosTagCompatibilityITest method testTagSet4.

/**
 * Checks that every listed model uses the same POS tag set as the first
 * lexicalized parser, {@code lexParsers[0]}, which acts as the reference.
 *
 * @param lexParsers    lexicalized parser models; the first is the reference
 * @param maxentTaggers maxent tagger models to compare
 * @param srParsers     shift-reduce parser models to compare
 * @param nnDepParsers  neural dependency parser models to compare
 */
private static void testTagSet4(String[] lexParsers, String[] maxentTaggers, String[] srParsers, String[] nnDepParsers) {
    String reference = lexParsers[0];
    LexicalizedParser lp = LexicalizedParser.loadModel(reference);
    Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
    for (String name : maxentTaggers) {
        MaxentTagger tagger = new MaxentTagger(name);
        // Fetch the candidate tag set once instead of once per use.
        Set<String> other = tagger.tagSet();
        assertEquals(tagSetMismatchMessage(reference, name, tagSet, other), tagSet, other);
    }
    for (String name : lexParsers) {
        LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
        // The reference parser's basic-category function is applied to the
        // other parser's lexicon, exactly as in the original comparison.
        Set<String> other = lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
        assertEquals(tagSetMismatchMessage(reference, name, tagSet, other), tagSet, other);
    }
    for (String name : srParsers) {
        ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
        Set<String> other = srp.tagSet();
        assertEquals(tagSetMismatchMessage(reference, name, tagSet, other), tagSet, other);
    }
    for (String name : nnDepParsers) {
        DependencyParser dp = DependencyParser.loadFromModelFile(name);
        Set<String> other = dp.getPosSet();
        assertEquals(tagSetMismatchMessage(reference, name, tagSet, other), tagSet, other);
    }
}

/** Builds the assertion message listing the tags present on one side only. */
private static String tagSetMismatchMessage(String leftName, String rightName, Set<String> left, Set<String> right) {
    return leftName + " vs. " + rightName + " tag set mismatch:\n" + "left - right: " + Sets.diff(left, right) + "; right - left: " + Sets.diff(right, left) + "\n";
}
Also used : MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) ShiftReduceParser(edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser) DependencyParser(edu.stanford.nlp.parser.nndep.DependencyParser) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser)

Example 3 with LexicalizedParser

use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.

the class BuildBinarizedDataset method main.

/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the sentence together with the full sentence.
 * Lines after the first sentence line but before
 * the blank line will be treated as labeled sub-phrases.  The
 * labels should start with the label and then contain a list of
 * tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
 *  For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences.  Any spans with given
 * labels will then be used to adjust those labels.
 * <br>
 * One tree per input block is printed to standard output.
 *
 * @param args command line flags; each of <code>-input</code>,
 *             <code>-parserModel</code> and <code>-sentimentModel</code>
 *             must be followed by a value
 * @throws IllegalArgumentException if a flag is missing its value or no
 *                                  input file was given
 */
public static void main(String[] args) {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String inputPath = null;
    String sentimentModelPath = null;
    SentimentModel sentimentModel = null;
    for (int argIndex = 0; argIndex < args.length; argIndex += 2) {
        String flag = args[argIndex];
        boolean isInput = flag.equalsIgnoreCase("-input");
        boolean isParserModel = flag.equalsIgnoreCase("-parserModel");
        boolean isSentimentModel = flag.equalsIgnoreCase("-sentimentModel");
        if (!isInput && !isParserModel && !isSentimentModel) {
            log.info("Unknown argument " + flag);
            System.exit(2);
        }
        // Every recognised flag takes a value; report a trailing flag clearly
        // rather than failing with an ArrayIndexOutOfBoundsException.
        if (argIndex + 1 >= args.length) {
            throw new IllegalArgumentException("Missing value for argument " + flag);
        }
        String value = args[argIndex + 1];
        if (isInput) {
            inputPath = value;
        } else if (isParserModel) {
            parserModel = value;
        } else {
            sentimentModelPath = value;
        }
    }
    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }
    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }
    String text = IOUtils.slurpFileNoExceptions(inputPath);
    // need blank line to make a new chunk
    String[] chunks = text.split("\\n\\s*\\n+");
    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        // The first token is the sentence-level label. Integer.valueOf replaces
        // the deprecated Integer(String) constructor and parses identically.
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //log.info(tokens);
        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }
        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
        }
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        // User-supplied span labels override whatever was assigned above.
        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        System.out.println(collapsedUnary);
    //System.out.println();
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TreeBinarizer(edu.stanford.nlp.parser.lexparser.TreeBinarizer) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) StringReader(java.io.StringReader) Tree(edu.stanford.nlp.trees.Tree) DocumentPreprocessor(edu.stanford.nlp.process.DocumentPreprocessor) Map(java.util.Map) Pair(edu.stanford.nlp.util.Pair)

Example 4 with LexicalizedParser

use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.

the class CrossValidateTestOptions method main.

/**
 * Evaluates a lexicalized parser on a test treebank once for each value in
 * {@code weights}, setting {@code baseParserWeight} to that value before
 * each run, and logs the labeled-bracket and tag scores per weight.
 * <br>
 * Recognised flags are {@code -lexparser <model>} and
 * {@code -testTreebank <description>}; all other arguments are passed
 * through to {@link LexicalizedParser#loadModel} as extra flags.
 *
 * @param args command line arguments
 * @throws IllegalArgumentException if {@code -lexparser} has no value, or if
 *                                  either required flag is absent
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {
    String lexparserFile = null;
    String testTreebankPath = null;
    FileFilter testTreebankFilter = null;
    List<String> unusedArgs = new ArrayList<>();
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-lexparser")) {
            // Guard against the flag being the last argument.
            if (argIndex + 1 >= args.length) {
                throw new IllegalArgumentException("Missing value for argument " + args[argIndex]);
            }
            lexparserFile = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-testTreebank")) {
            Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-testTreebank");
            argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;
            testTreebankPath = treebankDescription.first();
            testTreebankFilter = treebankDescription.second();
        } else {
            unusedArgs.add(args[argIndex++]);
        }
    }
    // Both inputs are used unconditionally below, so fail fast with a clear
    // message instead of letting a null reach the parser or the evaluator.
    if (lexparserFile == null) {
        throw new IllegalArgumentException("Must specify a parser model with -lexparser");
    }
    if (testTreebankPath == null) {
        throw new IllegalArgumentException("Must specify a test treebank with -testTreebank");
    }
    log.info("Loading lexparser from: " + lexparserFile);
    String[] newArgs = unusedArgs.toArray(new String[unusedArgs.size()]);
    LexicalizedParser lexparser = LexicalizedParser.loadModel(lexparserFile, newArgs);
    log.info("... done");
    log.info("Reading in trees from " + testTreebankPath);
    if (testTreebankFilter != null) {
        log.info("Filtering on " + testTreebankFilter);
    }
    Treebank testTreebank = lexparser.getOp().tlpParams.memoryTreebank();
    testTreebank.loadPath(testTreebankPath, testTreebankFilter);
    log.info("Read in " + testTreebank.size() + " trees for testing");
    // weights is declared elsewhere in the class; one evaluation per entry.
    double[] labelResults = new double[weights.length];
    double[] tagResults = new double[weights.length];
    for (int i = 0; i < weights.length; ++i) {
        lexparser.getOp().baseParserWeight = weights[i];
        EvaluateTreebank evaluator = new EvaluateTreebank(lexparser);
        evaluator.testOnTreebank(testTreebank);
        labelResults[i] = evaluator.getLBScore();
        tagResults[i] = evaluator.getTagScore();
    }
    // Report after all runs so the summary is not interleaved with
    // whatever the evaluations themselves log.
    for (int i = 0; i < weights.length; ++i) {
        log.info("LexicalizedParser weight " + weights[i] + ": labeled " + labelResults[i] + " tag " + tagResults[i]);
    }
}
Also used : EvaluateTreebank(edu.stanford.nlp.parser.lexparser.EvaluateTreebank) Treebank(edu.stanford.nlp.trees.Treebank) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) ArrayList(java.util.ArrayList) FileFilter(java.io.FileFilter) Pair(edu.stanford.nlp.util.Pair)

Example 5 with LexicalizedParser

use of edu.stanford.nlp.parser.lexparser.LexicalizedParser in project CoreNLP by stanfordnlp.

the class DumpMatrices method main.

/**
 * Loads a parser model, extracts its {@code DVModel} and dumps the model's
 * matrices as text files under a new output directory: one file per entry
 * in {@code binaryW}, {@code binaryScore}, {@code unaryW} and
 * {@code unaryScore}, plus a single {@code embeddings.txt} with one word
 * vector per line.
 *
 * @param args {@code -model <path>} and {@code -output <dir>}, both required;
 *             the output directory must not already exist
 * @throws IOException if writing {@code embeddings.txt} fails
 */
public static void main(String[] args) throws IOException {
    String modelPath = null;
    String outputDir = null;
    for (int argIndex = 0; argIndex < args.length; argIndex += 2) {
        String flag = args[argIndex];
        boolean isModel = flag.equalsIgnoreCase("-model");
        boolean isOutput = flag.equalsIgnoreCase("-output");
        // NOTE(review): help() is defined elsewhere and presumably exits the
        // JVM. The explicit returns keep this method correct even if it does
        // not (the original would otherwise spin forever on an unknown flag).
        if (!isModel && !isOutput) {
            log.info("Unknown argument " + flag);
            help();
            return;
        }
        // A flag given as the last argument has no value to read.
        if (argIndex + 1 >= args.length) {
            log.info("Missing value for argument " + flag);
            help();
            return;
        }
        if (isModel) {
            modelPath = args[argIndex + 1];
        } else {
            outputDir = args[argIndex + 1];
        }
    }
    if (outputDir == null || modelPath == null) {
        help();
        return;
    }
    File outputFile = new File(outputDir);
    FileSystem.checkNotExistsOrFail(outputFile);
    FileSystem.mkdirOrFail(outputFile);
    LexicalizedParser parser = LexicalizedParser.loadModel(modelPath);
    DVModel model = DVParser.getModelFromLexicalizedParser(parser);
    String binaryWDir = outputDir + File.separator + "binaryW";
    FileSystem.mkdirOrFail(binaryWDir);
    for (TwoDimensionalMap.Entry<String, String, SimpleMatrix> entry : model.binaryTransform) {
        String filename = binaryWDir + File.separator + entry.getFirstKey() + "_" + entry.getSecondKey() + ".txt";
        dumpMatrix(filename, entry.getValue());
    }
    String binaryScoreDir = outputDir + File.separator + "binaryScore";
    FileSystem.mkdirOrFail(binaryScoreDir);
    for (TwoDimensionalMap.Entry<String, String, SimpleMatrix> entry : model.binaryScore) {
        String filename = binaryScoreDir + File.separator + entry.getFirstKey() + "_" + entry.getSecondKey() + ".txt";
        dumpMatrix(filename, entry.getValue());
    }
    String unaryWDir = outputDir + File.separator + "unaryW";
    FileSystem.mkdirOrFail(unaryWDir);
    for (Map.Entry<String, SimpleMatrix> entry : model.unaryTransform.entrySet()) {
        String filename = unaryWDir + File.separator + entry.getKey() + ".txt";
        dumpMatrix(filename, entry.getValue());
    }
    String unaryScoreDir = outputDir + File.separator + "unaryScore";
    FileSystem.mkdirOrFail(unaryScoreDir);
    for (Map.Entry<String, SimpleMatrix> entry : model.unaryScore.entrySet()) {
        String filename = unaryScoreDir + File.separator + entry.getKey() + ".txt";
        dumpMatrix(filename, entry.getValue());
    }
    String embeddingFile = outputDir + File.separator + "embeddings.txt";
    // try-with-resources closes bout and then fout, the same order as the
    // explicit close() calls it replaces, but also when a write throws.
    try (FileWriter fout = new FileWriter(embeddingFile);
         BufferedWriter bout = new BufferedWriter(fout)) {
        for (Map.Entry<String, SimpleMatrix> entry : model.wordVectors.entrySet()) {
            bout.write(entry.getKey());
            SimpleMatrix vector = entry.getValue();
            // Only column 0 is read, one value per row.
            for (int i = 0; i < vector.numRows(); ++i) {
                bout.write("  " + vector.get(i, 0));
            }
            bout.write("\n");
        }
    }
}
Also used : SimpleMatrix(org.ejml.simple.SimpleMatrix) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) FileWriter(java.io.FileWriter) TwoDimensionalMap(edu.stanford.nlp.util.TwoDimensionalMap) File(java.io.File) Map(java.util.Map) BufferedWriter(java.io.BufferedWriter)

Aggregations

LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser): 20; Tree (edu.stanford.nlp.trees.Tree): 7; Pair (edu.stanford.nlp.util.Pair): 7; Treebank (edu.stanford.nlp.trees.Treebank): 6; FileFilter (java.io.FileFilter): 5; ArrayList (java.util.ArrayList): 5; Map (java.util.Map): 5; StringReader (java.io.StringReader): 4; SimpleMatrix (org.ejml.simple.SimpleMatrix): 4; HasWord (edu.stanford.nlp.ling.HasWord): 3; EvaluateTreebank (edu.stanford.nlp.parser.lexparser.EvaluateTreebank): 3; DocumentPreprocessor (edu.stanford.nlp.process.DocumentPreprocessor): 3; BufferedWriter (java.io.BufferedWriter): 3; FileWriter (java.io.FileWriter): 3; Word (edu.stanford.nlp.ling.Word): 2; ParserQuery (edu.stanford.nlp.parser.common.ParserQuery): 2; Options (edu.stanford.nlp.parser.lexparser.Options): 2; RerankingParserQuery (edu.stanford.nlp.parser.lexparser.RerankingParserQuery): 2; DeepTree (edu.stanford.nlp.trees.DeepTree): 2; TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 2