Search in sources :

Example 1 with NumberRangesFileFilter

use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

The class TreeAnnotatorAndBinarizer, method main.

/** Lets you test out the TreeAnnotatorAndBinarizer on the command line.
   *
   *  @param args Command line arguments: All flags accepted by FactoredParser.setOptionFlag
   *     and -train treebankPath [fileRanges]
   */
public static void main(String[] args) {
    Options op = new Options();
    String treebankPath = null;
    FileFilter trainFilter = null;
    int i = 0;
    // Consume leading "-" flags; anything other than -train is delegated to Options.
    while (i < args.length && args[i].startsWith("-")) {
        if (args[i].equalsIgnoreCase("-train")) {
            int numSubArgs = numSubArgs(args, i);
            i++;
            if (numSubArgs >= 1) {
                treebankPath = args[i];
                i++;
            } else {
                throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
            }
            if (numSubArgs == 2) {
                // One extra argument: a comma-separated number-ranges spec (e.g. "1-200,250-375").
                trainFilter = new NumberRangesFileFilter(args[i++], true);
            } else if (numSubArgs >= 3) {
                // Two extra arguments: an explicit low/high file-number range.
                int low = Integer.parseInt(args[i]);
                int high = Integer.parseInt(args[i + 1]);
                trainFilter = new NumberRangeFileFilter(low, high, true);
                i += 2;
            }
        } else {
            i = op.setOption(args, i);
        }
    }
    // Leftover non-flag arguments mean the command line was malformed.
    if (i < args.length) {
        log.info("usage: java TreeAnnotatorAndBinarizer options*");
        // Fixed unbalanced bracket in the usage message ("fileRange]" -> "[fileRange]").
        log.info("  Options are like for lexicalized parser including -train treebankPath [fileRange]");
        return;
    }
    log.info("Annotating from treebank dir: " + treebankPath);
    Treebank trainTreebank = op.tlpParams.diskTreebank();
    if (trainFilter == null) {
        trainTreebank.loadPath(treebankPath);
    } else {
        trainTreebank.loadPath(treebankPath, trainFilter);
    }
    Treebank binaryTrainTreebank = getAnnotatedBinaryTreebankFromTreebank(trainTreebank, null, null, op).first();
    // Walk the original and binarized treebanks in parallel so each pair prints together.
    Iterator<Tree> it = trainTreebank.iterator();
    for (Tree t : binaryTrainTreebank) {
        System.out.println("Original tree:");
        it.next().pennPrint();
        System.out.println("Binarized tree:");
        t.pennPrint();
        System.out.println();
    }
}
Also used : NumberRangeFileFilter(edu.stanford.nlp.io.NumberRangeFileFilter) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) FileFilter(java.io.FileFilter) NumberRangeFileFilter(edu.stanford.nlp.io.NumberRangeFileFilter)

Example 2 with NumberRangesFileFilter

use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

The class CharacterLevelTagExtender, method main.

/**
   * for testing -- CURRENTLY BROKEN!!!
   *
   * Trains (or loads) a character-tag PCFG parser, then parses a test treebank,
   * printing gold and predicted preterminal yields side by side to "out.chi".
   *
   * @param args input dir and output filename
   * @throws IOException If a treebank or output file cannot be accessed
   */
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        throw new RuntimeException("args: treebankPath trainNums testNums");
    }
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    ctpp.charTags = true;
    // TODO: these options are getting clobbered by reading in the
    // parser object (unless it's a text file parser?)
    Options op = new Options(ctpp);
    op.doDep = false;
    op.testOptions.maxLength = 90;
    LexicalizedParser lp;
    try {
        FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
        lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
        try {
            String filename = "chineseCharTagPCFG.ser.gz";
            log.info("Writing parser in serialized format to file " + filename + " ");
            System.err.flush();
            // try-with-resources ensures the stream is closed even if writeObject fails.
            try (ObjectOutputStream out = IOUtils.writeStreamFromString(filename)) {
                out.writeObject(lp);
            }
            log.info("done.");
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    } catch (IllegalArgumentException e) {
        // Fallback: treat args[1] as a serialized parser model instead of a range spec.
        lp = LexicalizedParser.loadModel(args[1], op);
    }
    FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
    MemoryTreebank testTreebank = ctpp.memoryTreebank();
    testTreebank.loadPath(new File(args[0]), testFilt);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
    WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
    WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
    EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
    //    System.out.println("Preterminals:" + preterminals);
    System.out.println("Testing...");
    for (Tree gold : testTreebank) {
        Tree tree;
        try {
            tree = lp.parseTree(gold.yieldHasWord());
            if (tree == null) {
                System.out.println("Failed to parse " + gold.yieldHasWord());
                continue;
            }
        } catch (Exception e) {
            // Best-effort evaluation: skip sentences the parser crashes on.
            e.printStackTrace();
            continue;
        }
        gold = gold.firstChild();
        pw.println(SentenceUtils.listToString(gold.preTerminalYield()));
        pw.println(SentenceUtils.listToString(gold.yield()));
        gold.pennPrint(pw);
        pw.println(tree.preTerminalYield());
        pw.println(tree.yield());
        tree.pennPrint(pw);
        //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
        //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
        //      eval.eval(allBrackets, goldBrackets);
        eval.displayLast();
    }
    System.out.println();
    System.out.println();
    eval.display();
}
Also used : Options(edu.stanford.nlp.parser.lexparser.Options) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) LexicalizedParser(edu.stanford.nlp.parser.lexparser.LexicalizedParser) EquivalenceClassEval(edu.stanford.nlp.stats.EquivalenceClassEval) ChineseTreebankParserParams(edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter)

Example 3 with NumberRangesFileFilter

use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

The class TregexGUI, method doFileFilters.

// Pops up a modal dialog letting the user add file-loading constraints (filters)
// before the given files are loaded; on Okay, validates any number-range filter
// and kicks off the loading thread.
private void doFileFilters(File[] files) {
    //System.out.println("Doing file filters");
    // Effectively-final copies for capture by the button listeners below.
    final File[] cFiles = files;
    final JPanel fileFilterPanel = new JPanel();
    fileFilterPanel.setLayout(new BoxLayout(fileFilterPanel, BoxLayout.PAGE_AXIS));
    JLabel text = new JLabel("<html>Please indicate any constraints on the files you want to load. All files in specified folders that satisfy all of the given constraints will be loaded. Just press Okay to load all files.</html>");
    //text.setBorder(BorderFactory.createLineBorder(Color.black));
    text.setAlignmentX(SwingConstants.LEADING);
    JPanel textPanel = new JPanel(new BorderLayout());
    textPanel.setPreferredSize(new Dimension(100, 50));
    //textPanel.setBorder(BorderFactory.createLineBorder(Color.black));
    textPanel.add(text);
    fileFilterPanel.add(textPanel);
    fileFilterPanel.add(Box.createVerticalStrut(5));
    // Start with a single empty filter row; "Add another filter" appends more.
    Box defaultFilter = getNewFilter();
    //defaultFilter.setBorder(BorderFactory.createLineBorder(Color.black));
    //fileFilterPanel.setBorder(BorderFactory.createLineBorder(Color.black));
    fileFilterPanel.add(defaultFilter);
    final JOptionPane fileFilterDialog = new JOptionPane();
    fileFilterDialog.setMessage(fileFilterPanel);
    JButton[] options = new JButton[3];
    JButton okay = new JButton("Okay");
    JButton add = new JButton("Add another filter");
    JButton cancel = new JButton("Cancel");
    options[0] = okay;
    options[1] = add;
    options[2] = cancel;
    fileFilterDialog.setOptions(options);
    final JDialog dialog = fileFilterDialog.createDialog(null, "Set file filters...");
    okay.addActionListener(arg0 -> {
        final EnumMap<FilterType, String> filters = getFilters(fileFilterPanel);
        if (filters.containsKey(FilterType.isInRange)) {
            try {
                // Constructed only to validate the range syntax; the instance is discarded.
                new NumberRangesFileFilter(filters.get(FilterType.isInRange), false);
            } catch (Exception e) {
                JOptionPane.showMessageDialog(dialog, new JLabel("<html>Please check the range you specified for the file number.  Ranges must be numerical, and disjoint <br>ranges should be separated by commas.  For example \"1-200,250-375\" is a valid range.</html>"), "Error in File Number Range", JOptionPane.ERROR_MESSAGE);
                // Keep the dialog open so the user can correct the range.
                return;
            }
        }
        dialog.setVisible(false);
        startFileLoadingThread(filters, cFiles);
    });
    add.addActionListener(e -> {
        fileFilterPanel.add(getNewFilter());
        // Re-pack so the dialog grows to fit the new filter row.
        dialog.pack();
    });
    cancel.addActionListener(e -> dialog.setVisible(false));
    // Enter key triggers Okay.
    dialog.getRootPane().setDefaultButton(okay);
    dialog.pack();
    dialog.setLocationRelativeTo(this);
    dialog.setVisible(true);
}
Also used : NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Dimension(java.awt.Dimension) IOException(java.io.IOException) BorderLayout(java.awt.BorderLayout) File(java.io.File)

Example 4 with NumberRangesFileFilter

use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

The class Treebanks, method main.

/**
   * Loads treebank and prints it.
   * All files below the designated <code>filePath</code> within the given
   * number range if any are loaded.  You can normalize the trees or not
   * (English-specific) and print trees one per line up to a certain length
   * (for EVALB).
   * <p>
   * Usage: <code>
   * java edu.stanford.nlp.trees.Treebanks [-maxLength n|-normalize|-treeReaderFactory class] filePath [numberRanges]
   * </code>
   *
   * @param args Array of command-line arguments
   * @throws java.io.IOException If there is a treebank file access problem
   */
public static void main(String[] args) throws IOException {
    if (args.length == 0) {
        printUsage();
        return;
    }
    int i = 0;
    final int maxLength;
    final int minLength;
    int maxL = Integer.MAX_VALUE;
    int minL = -1;
    boolean normalized = false;
    boolean decimate = false;
    boolean pennPrintTrees = false;
    boolean oneLinePrint = false;
    boolean printTaggedWords = false;
    boolean printWords = false;
    boolean correct = false;
    String annotationOptions = null;
    boolean summary = false;
    boolean timing = false;
    boolean yield = false;
    boolean punct = false;
    boolean sentenceLengths = false;
    boolean countTaggings = false;
    boolean removeCodeTrees = false;
    String decimatePrefix = null;
    String encoding = TreebankLanguagePack.DEFAULT_ENCODING;
    String suffix = Treebank.DEFAULT_TREE_FILE_SUFFIX;
    TreeReaderFactory trf = null;
    TreebankLanguagePack tlp = null;
    List<Predicate<Tree>> filters = new ArrayList<>();
    // Parse leading "-" flags; i ends up pointing at the treebank path argument.
    while (i < args.length && args[i].startsWith("-")) {
        if (args[i].equals("-maxLength") && i + 1 < args.length) {
            maxL = Integer.parseInt(args[i + 1]);
            i += 2;
        } else if (args[i].equals("-minLength") && i + 1 < args.length) {
            minL = Integer.parseInt(args[i + 1]);
            i += 2;
        } else if (args[i].equals("-h") || args[i].equals("-help")) {
            printUsage();
            i++;
        } else if (args[i].equals("-normalized")) {
            normalized = true;
            i += 1;
        } else if (args[i].equalsIgnoreCase("-tlp")) {
            try {
                // getDeclaredConstructor().newInstance() replaces the deprecated
                // Class.newInstance(); any reflection failure is caught below.
                final Object o = Class.forName(args[i + 1]).getDeclaredConstructor().newInstance();
                tlp = (TreebankLanguagePack) o;
                trf = tlp.treeReaderFactory();
            } catch (Exception e) {
                log.info("Couldn't instantiate as TreebankLanguagePack: " + args[i + 1]);
                return;
            }
            i += 2;
        } else if (args[i].equals("-treeReaderFactory") || args[i].equals("-trf")) {
            try {
                final Object o = Class.forName(args[i + 1]).getDeclaredConstructor().newInstance();
                trf = (TreeReaderFactory) o;
            } catch (Exception e) {
                log.info("Couldn't instantiate as TreeReaderFactory: " + args[i + 1]);
                return;
            }
            i += 2;
        } else if (args[i].equals("-suffix")) {
            suffix = args[i + 1];
            i += 2;
        } else if (args[i].equals("-decimate")) {
            decimate = true;
            decimatePrefix = args[i + 1];
            i += 2;
        } else if (args[i].equals("-encoding")) {
            encoding = args[i + 1];
            i += 2;
        } else if (args[i].equals("-correct")) {
            correct = true;
            i += 1;
        } else if (args[i].equals("-summary")) {
            summary = true;
            i += 1;
        } else if (args[i].equals("-yield")) {
            yield = true;
            i += 1;
        } else if (args[i].equals("-punct")) {
            punct = true;
            i += 1;
        } else if (args[i].equals("-pennPrint")) {
            pennPrintTrees = true;
            i++;
        } else if (args[i].equals("-oneLine")) {
            oneLinePrint = true;
            i++;
        } else if (args[i].equals("-taggedWords")) {
            printTaggedWords = true;
            i++;
        } else if (args[i].equals("-words")) {
            printWords = true;
            i++;
        } else if (args[i].equals("-annotate")) {
            annotationOptions = args[i + 1];
            i += 2;
        } else if (args[i].equals("-timing")) {
            timing = true;
            i++;
        } else if (args[i].equals("-countTaggings")) {
            countTaggings = true;
            i++;
        } else if (args[i].equals("-sentenceLengths")) {
            sentenceLengths = true;
            i++;
        } else if (args[i].equals("-removeCodeTrees")) {
            removeCodeTrees = true;
            i++;
        } else if (args[i].equals("-filter")) {
            Predicate<Tree> filter = ReflectionLoading.loadByReflection(args[i + 1]);
            filters.add(filter);
            i += 2;
        } else {
            log.info("Unknown option: " + args[i]);
            i++;
        }
    }
    // Freeze length bounds into finals so the lambdas below can capture them.
    maxLength = maxL;
    minLength = minL;
    Treebank treebank;
    if (trf == null) {
        trf = in -> new PennTreeReader(in, new LabeledScoredTreeFactory());
    }
    if (normalized) {
        // NOTE(review): -normalized ignores any user-supplied trf/encoding — confirm intended.
        treebank = new DiskTreebank();
    } else {
        treebank = new DiskTreebank(trf, encoding);
    }
    for (Predicate<Tree> filter : filters) {
        treebank = new FilteringTreebank(treebank, filter);
    }
    final PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
    // args[i] is the treebank path; an optional args[i+1] is a number-ranges spec.
    if (i + 1 < args.length) {
        treebank.loadPath(args[i], new NumberRangesFileFilter(args[i + 1], true));
    } else if (i < args.length) {
        treebank.loadPath(args[i], suffix, true);
    } else {
        printUsage();
        return;
    }
    if (annotationOptions != null) {
        // todo Not yet implemented
        log.info("annotationOptions not yet implemented");
    }
    if (summary) {
        System.out.println(treebank.textualSummary());
    }
    if (sentenceLengths) {
        sentenceLengths(treebank, args[i], ((i + 1) < args.length ? args[i + 1] : null), pw);
    }
    if (punct) {
        printPunct(treebank, tlp, pw);
    }
    if (correct) {
        treebank = new EnglishPTBTreebankCorrector().transformTrees(treebank);
    }
    if (pennPrintTrees) {
        treebank.apply(tree -> {
            int length = tree.yield().size();
            if (length >= minLength && length <= maxLength) {
                tree.pennPrint(pw);
                pw.println();
            }
        });
    }
    if (oneLinePrint) {
        treebank.apply(tree -> {
            int length = tree.yield().size();
            if (length >= minLength && length <= maxLength) {
                pw.println(tree);
            }
        });
    }
    if (printWords) {
        final TreeNormalizer tn = new BobChrisTreeNormalizer();
        treebank.apply(tree -> {
            Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory());
            int length = tPrime.yield().size();
            if (length >= minLength && length <= maxLength) {
                // NOTE(review): -words prints taggedYield, same as -taggedWords (minus
                // the "_" separator); looks like it may have been meant to print yield().
                pw.println(SentenceUtils.listToString(tPrime.taggedYield()));
            }
        });
    }
    if (printTaggedWords) {
        final TreeNormalizer tn = new BobChrisTreeNormalizer();
        treebank.apply(tree -> {
            Tree tPrime = tn.normalizeWholeTree(tree, tree.treeFactory());
            pw.println(SentenceUtils.listToString(tPrime.taggedYield(), false, "_"));
        });
    }
    if (countTaggings) {
        countTaggings(treebank, pw);
    }
    if (yield) {
        treebank.apply(tree -> {
            int length = tree.yield().size();
            if (length >= minLength && length <= maxLength) {
                pw.println(SentenceUtils.listToString(tree.yield()));
            }
        });
    }
    if (decimate) {
        // Split the treebank 8:1:1 into train/dev/test files.
        Writer w1 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-train.txt"), encoding));
        Writer w2 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-dev.txt"), encoding));
        Writer w3 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-test.txt"), encoding));
        treebank.decimate(w1, w2, w3);
    }
    if (timing) {
        runTiming(treebank);
    }
    if (removeCodeTrees) {
        // this is a bit of a hack. It only works on an individual file
        if (new File(args[i]).isDirectory()) {
            throw new RuntimeException("-removeCodeTrees only works on a single file");
        }
        String treebankStr = IOUtils.slurpFile(args[i]);
        treebankStr = treebankStr.replaceAll("\\( \\(CODE <[^>]+>\\)\\)", "");
        Writer w = new OutputStreamWriter(new FileOutputStream(args[i]), encoding);
        w.write(treebankStr);
        w.close();
    }
}
Also used : NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) Predicate(java.util.function.Predicate)

Example 5 with NumberRangesFileFilter

use of edu.stanford.nlp.io.NumberRangesFileFilter in project CoreNLP by stanfordnlp.

The class ChineseCharacterBasedLexiconTraining, method main.

/**
 * Command-line driver for training and evaluating the Chinese character-based
 * lexicon: can print treebank stats (-stats), train or load a parser (-parser),
 * train or load a lexicon (-lex), and segment/parse/evaluate a test set (-test).
 *
 * @param args flags as declared in flagsToNumArgs below, plus pass-through
 *     ChineseTreebankParserParams option flags
 * @throws IOException If a treebank, model, or output file cannot be accessed
 */
public static void main(String[] args) throws IOException {
    // Declare how many sub-arguments each multi-argument flag consumes.
    Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
    flagsToNumArgs.put("-parser", Integer.valueOf(3));
    flagsToNumArgs.put("-lex", Integer.valueOf(3));
    flagsToNumArgs.put("-test", Integer.valueOf(2));
    flagsToNumArgs.put("-out", Integer.valueOf(1));
    flagsToNumArgs.put("-lengthPenalty", Integer.valueOf(1));
    flagsToNumArgs.put("-penaltyType", Integer.valueOf(1));
    flagsToNumArgs.put("-maxLength", Integer.valueOf(1));
    flagsToNumArgs.put("-stats", Integer.valueOf(2));
    Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
    boolean eval = argMap.containsKey("-eval");
    PrintWriter pw = null;
    if (argMap.containsKey("-out")) {
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap.get("-out"))[0]), "GB18030"), true);
    }
    log.info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    // Every raw arg is also offered to ctpp as an option flag, and echoed.
    for (int i = 0; i < args.length; i++) {
        ctpp.setOptionFlag(args, i);
        log.info(" " + args[i]);
    }
    log.info();
    Options op = new Options(ctpp);
    // -stats mode: load (and optionally annotate) a treebank, print stats, exit.
    if (argMap.containsKey("-stats")) {
        String[] statArgs = (argMap.get("-stats"));
        MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
        FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
        log.info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.containsKey("-annotate")) {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
            for (Tree tree : rawTrainTreebank) {
                trainTreebank.add(annotator.transformTree(tree));
            }
            log.info("Done annotating trees.");
        } else {
            trainTreebank = rawTrainTreebank;
        }
        printStats(trainTreebank, pw);
        System.exit(0);
    }
    // maxLength here only gates which test sentences are processed below;
    // the parser's own limit is set to 120 unconditionally a few lines down.
    int maxLength = 1000000;
    //    Test.verbose = true;
    if (argMap.containsKey("-norm")) {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.containsKey("-maxLength")) {
        maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
    }
    op.testOptions.maxLength = 120;
    boolean combo = argMap.containsKey("-combo");
    if (combo) {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    Lexicon lex = null;
    // -parser mode: with 2-3 sub-args train from a treebank (optionally
    // serializing the result); with 1 sub-arg load a serialized model.
    if (argMap.containsKey("-parser")) {
        String[] parserArgs = (argMap.get("-parser"));
        if (parserArgs.length > 1) {
            FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.length == 3) {
                String filename = parserArgs[2];
                log.info("Writing parser in serialized format to file " + filename + " ");
                System.err.flush();
                ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
                out.writeObject(lp);
                out.close();
                log.info("done.");
            }
        } else {
            String parserFile = parserArgs[0];
            lp = LexicalizedParser.loadModel(parserFile, op);
        }
        // The loaded/trained parser's own lexicon and options replace the local ones.
        lex = lp.getLexicon();
        op = lp.getOp();
        ctpp = (ChineseTreebankParserParams) op.tlpParams;
    }
    if (argMap.containsKey("-rad")) {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.containsKey("-lengthPenalty")) {
        ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
    }
    if (argMap.containsKey("-penaltyType")) {
        ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
    }
    // -lex mode: with 2-3 sub-args train a lexicon from a treebank (optionally
    // serializing it); with 1 sub-arg deserialize one from a file.
    if (argMap.containsKey("-lex")) {
        String[] lexArgs = (argMap.get("-lex"));
        if (lexArgs.length > 1) {
            Index<String> wordIndex = new HashIndex<>();
            Index<String> tagIndex = new HashIndex<>();
            lex = ctpp.lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
            FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
            log.info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.containsKey("-annotate")) {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
                for (Tree tree : rawTrainTreebank) {
                    tree = annotator.transformTree(tree);
                    trainTreebank.add(tree);
                }
                log.info("Done annotating trees.");
            } else {
                trainTreebank = rawTrainTreebank;
            }
            lex.initializeTraining(trainTreebank.size());
            lex.train(trainTreebank);
            lex.finishTraining();
            log.info("Done training lexicon.");
            if (lexArgs.length == 3) {
                String filename = lexArgs.length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                log.info("Writing lexicon in serialized format to file " + filename + " ");
                System.err.flush();
                ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
                out.writeObject(lex);
                out.close();
                log.info("done.");
            }
        } else {
            String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.info("Reading Lexicon from file " + lexFile);
            ObjectInputStream in = IOUtils.readStreamFromString(lexFile);
            try {
                lex = (Lexicon) in.readObject();
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Bad serialized file: " + lexFile);
            }
            in.close();
        }
    }
    // -test mode: segment and/or parse each test sentence; optionally print
    // output trees/segmentations (-out) and score brackets (-eval).
    if (argMap.containsKey("-test")) {
        boolean segmentWords = ctpp.segment;
        boolean parse = lp != null;
        assert (parse || segmentWords);
        //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        WordSegmenter seg = null;
        if (segmentWords) {
            // In segmentation mode the lexicon is assumed to double as a segmenter.
            seg = (WordSegmenter) lex;
        }
        String[] testArgs = (argMap.get("-test"));
        MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
        FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.loadPath(new File(testArgs[0]), testFilt);
        TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
        TreeTransformer collinizer = ctpp.collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        // Decide which bracket types (word/tag/category) each eval should count.
        List<String> evalTypes = new ArrayList<>(3);
        boolean goodPOS = false;
        if (segmentWords) {
            evalTypes.add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse) {
                evalTypes.add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse) {
            evalTypes.add(WordCatConstituent.tagType);
            evalTypes.add(WordCatConstituent.catType);
            if (combo) {
                evalTypes.add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.info("Testing...");
        for (Tree goldTop : testTreebank) {
            Tree gold = goldTop.firstChild();
            List<HasWord> goldSentence = gold.yieldHasWord();
            if (goldSentence.size() > maxLength) {
                log.info("Skipping sentence; too long: " + goldSentence.size());
                continue;
            } else {
                log.info("Processing sentence; length: " + goldSentence.size());
            }
            List<HasWord> s;
            if (segmentWords) {
                // Concatenate the gold words back into an unsegmented character
                // string, then let the segmenter re-segment it.
                StringBuilder goldCharBuf = new StringBuilder();
                for (HasWord aGoldSentence : goldSentence) {
                    StringLabel word = (StringLabel) aGoldSentence;
                    goldCharBuf.append(word.value());
                }
                String goldChars = goldCharBuf.toString();
                s = seg.segment(goldChars);
            } else {
                s = goldSentence;
            }
            Tree tree;
            if (parse) {
                tree = lp.parseTree(s);
                if (tree == null) {
                    throw new RuntimeException("PARSER RETURNED NULL!!!");
                }
            } else {
                // Segmentation-only mode: wrap the segmented words in a flat tree.
                tree = Trees.toFlatTree(s);
                tree = subcategoryStripper.transformTree(tree);
            }
            if (pw != null) {
                if (parse) {
                    tree.pennPrint(pw);
                } else {
                    // Print the segmented words space-separated on one line.
                    Iterator sentIter = s.iterator();
                    for (; ; ) {
                        Word word = (Word) sentIter.next();
                        pw.print(word.word());
                        if (sentIter.hasNext()) {
                            pw.print(" ");
                        } else {
                            break;
                        }
                    }
                }
                pw.println();
            }
            if (eval) {
                Collection ourBrackets, goldBrackets;
                ourBrackets = proc.allBrackets(tree);
                goldBrackets = proc.allBrackets(gold);
                if (goodPOS) {
                    // Add word/tag brackets common to both trees so POS accuracy counts.
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(tree, gold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(gold, tree));
                }
                basicEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nScores:");
                basicEval.displayLast();
                // Score again after collinization (standard evaluation normalization).
                Tree collinsTree = collinizer.transformTree(tree);
                Tree collinsGold = collinizer.transformTree(gold);
                ourBrackets = proc.allBrackets(collinsTree);
                goldBrackets = proc.allBrackets(collinsGold);
                if (goodPOS) {
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(collinsTree, collinsGold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nCollinized scores:");
                collinsEval.displayLast();
                System.out.println();
            }
        }
        if (eval) {
            basicEval.display();
            System.out.println();
            collinsEval.display();
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Word(edu.stanford.nlp.ling.Word) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) ArrayList(java.util.ArrayList) ObjectOutputStream(java.io.ObjectOutputStream) StringLabel(edu.stanford.nlp.ling.StringLabel) TreeToBracketProcessor(edu.stanford.nlp.trees.TreeToBracketProcessor) WordSegmenter(edu.stanford.nlp.process.WordSegmenter) Iterator(java.util.Iterator) Tree(edu.stanford.nlp.trees.Tree) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) FileFilter(java.io.FileFilter) PrintWriter(java.io.PrintWriter) HasWord(edu.stanford.nlp.ling.HasWord) WordCatEqualityChecker(edu.stanford.nlp.trees.WordCatEqualityChecker) HashIndex(edu.stanford.nlp.util.HashIndex) WordCatEquivalenceClasser(edu.stanford.nlp.trees.WordCatEquivalenceClasser) FileOutputStream(java.io.FileOutputStream) Collection(java.util.Collection) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) ObjectInputStream(java.io.ObjectInputStream)

Aggregations

NumberRangesFileFilter (edu.stanford.nlp.io.NumberRangesFileFilter)11 NumberRangeFileFilter (edu.stanford.nlp.io.NumberRangeFileFilter)3 TaggedWord (edu.stanford.nlp.ling.TaggedWord)3 Tree (edu.stanford.nlp.trees.Tree)3 FileFilter (java.io.FileFilter)3 HasWord (edu.stanford.nlp.ling.HasWord)2 Word (edu.stanford.nlp.ling.Word)2 TreeTransformer (edu.stanford.nlp.trees.TreeTransformer)2 Treebank (edu.stanford.nlp.trees.Treebank)2 HashIndex (edu.stanford.nlp.util.HashIndex)2 File (java.io.File)2 StringLabel (edu.stanford.nlp.ling.StringLabel)1 ChineseTreebankParserParams (edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams)1 LexicalizedParser (edu.stanford.nlp.parser.lexparser.LexicalizedParser)1 Options (edu.stanford.nlp.parser.lexparser.Options)1 WordSegmenter (edu.stanford.nlp.process.WordSegmenter)1 EquivalenceClassEval (edu.stanford.nlp.stats.EquivalenceClassEval)1 DiskTreebank (edu.stanford.nlp.trees.DiskTreebank)1 MemoryTreebank (edu.stanford.nlp.trees.MemoryTreebank)1 TreeNormalizer (edu.stanford.nlp.trees.TreeNormalizer)1