Example 81 with PrintWriter

use of java.io.PrintWriter in project CoreNLP by stanfordnlp.

the class QNMinimizer method minimize.

public double[] minimize(DiffFunction dFunction, double functionTolerance, double[] initial, int maxFunctionEvaluations, QNInfo qn) {
    if (mem > 0) {
        sayln("QNMinimizer called on double function of " + dFunction.domainDimension() + " variables, using M = " + mem + '.');
    } else {
        sayln("QNMinimizer called on double function of " + dFunction.domainDimension() + " variables, using dynamic setting of M.");
    }
    if (qn == null && presetInfo == null) {
        qn = new QNInfo(mem);
        noHistory = true;
    } else if (presetInfo != null) {
        qn = presetInfo;
        noHistory = false;
    } else if (qn != null) {
        noHistory = false;
    }
    its = 0;
    fevals = 0;
    success = false;
    qn.scaleOpt = scaleOpt;
    // initialize weights
    double[] x = initial;
    // initialize gradient
    double[] rawGrad = new double[x.length];
    double[] newGrad = new double[x.length];
    double[] newX = new double[x.length];
    double[] dir = new double[x.length];
    // initialize function value and gradient (gradient is stored in grad inside
    // evaluateFunction)
    double value = evaluateFunction(dFunction, x, rawGrad);
    double[] grad;
    if (useOWLQN) {
        double norm = l1NormOWL(x, dFunction);
        value += norm * lambdaOWL;
        // step (1) in Andrew & Gao (2007), except we are not computing v yet
        grad = pseudoGradientOWL(x, rawGrad, dFunction);
    } else {
        grad = rawGrad;
    }
    PrintWriter outFile = null;
    PrintWriter infoFile = null;
    if (outputToFile) {
        try {
            String baseName = "QN_m" + mem + '_' + lsOpt.toString() + '_' + scaleOpt.toString();
            outFile = new PrintWriter(new FileOutputStream(baseName + ".output"), true);
            infoFile = new PrintWriter(new FileOutputStream(baseName + ".info"), true);
            infoFile.println(dFunction.domainDimension() + "; DomainDimension ");
            infoFile.println(mem + "; memory");
        } catch (IOException e) {
            throw new RuntimeIOException("Caught IOException outputting QN data to file", e);
        }
    }
    Record rec = new Record(monitor, functionTolerance, outFile);
    // sets the original gradient and x. Also stores the monitor.
    rec.start(value, rawGrad, x);
    // Check if max Evaluations and Iterations have been provided.
    maxFevals = (maxFunctionEvaluations > 0) ? maxFunctionEvaluations : Integer.MAX_VALUE;
    // maxIterations = (maxIterations > 0) ? maxIterations : Integer.MAX_VALUE;
    sayln("               An explanation of the output:");
    sayln("Iter           The number of iterations");
    sayln("evals          The number of function evaluations");
    sayln("SCALING        <D> Diagonal scaling was used; <I> Scaled Identity");
    sayln("LINESEARCH     [## M steplength]  Minpack linesearch");
    sayln("                   1-Function value was too high");
    sayln("                   2-Value ok, gradient positive, positive curvature");
    sayln("                   3-Value ok, gradient negative, positive curvature");
    sayln("                   4-Value ok, gradient negative, negative curvature");
    sayln("               [.. B]  Backtracking");
    sayln("VALUE          The current function value");
    sayln("TIME           Total elapsed time");
    sayln("|GNORM|        The current norm of the gradient");
    sayln("{RELNORM}      The ratio of the current to initial gradient norms");
    sayln("AVEIMPROVE     The average improvement / current value");
    sayln("EVALSCORE      The last available eval score");
    sayln();
    sayln("Iter ## evals ## <SCALING> [LINESEARCH] VALUE TIME |GNORM| {RELNORM} AVEIMPROVE EVALSCORE");
    StringBuilder sb = new StringBuilder();
    eState state = eState.CONTINUE;
    // Beginning of the loop.
    do {
        try {
            if (!quiet) {
                sayln(sb.toString());
            }
            sb = new StringBuilder();
            boolean doEval = (its >= 0 && its >= startEvaluateIters && evaluateIters > 0 && its % evaluateIters == 0);
            its += 1;
            double newValue;
            sb.append("Iter ").append(its).append(" evals ").append(fevals).append(' ');
            // Compute the search direction
            sb.append('<');
            computeDir(dir, grad, x, qn, dFunction, sb);
            sb.append("> ");
            // sanity check dir and grad for NaN (x != x holds only for NaN)
            boolean hasNaNDir = false;
            boolean hasNaNGrad = false;
            for (int i = 0; i < dir.length; i++) {
                if (dir[i] != dir[i]) {
                    hasNaNDir = true;
                }
                if (grad[i] != grad[i]) {
                    hasNaNGrad = true;
                }
            }
            if (hasNaNDir && !hasNaNGrad) {
                sayln("(NaN dir likely due to Hessian approx - resetting) ");
                qn.clear();
                // re-compute the search direction
                sb.append('<');
                computeDir(dir, grad, x, qn, dFunction, sb);
                sb.append("> ");
            }
            // perform line search
            sb.append('[');
            // initialized in if/else/switch below
            double[] newPoint;
            if (useOWLQN) {
                // only linear search is allowed for OWL-QN
                newPoint = lineSearchBacktrackOWL(dFunction, dir, x, newX, grad, value, sb);
                sb.append('B');
            } else {
                // switch between line search options.
                switch(lsOpt) {
                    case BACKTRACK:
                        newPoint = lineSearchBacktrack(dFunction, dir, x, newX, grad, value, sb);
                        sb.append('B');
                        break;
                    case MINPACK:
                        newPoint = lineSearchMinPack(dFunction, dir, x, newX, grad, value, functionTolerance, sb);
                        sb.append('M');
                        break;
                    default:
                        throw new IllegalArgumentException("Invalid line search option for QNMinimizer.");
                }
            }
            // f and a are class constants indexing the line-search result:
            // the function value and the accepted step length, respectively.
            newValue = newPoint[f];
            sb.append(' ');
            sb.append(nf.format(newPoint[a]));
            sb.append("] ");
            // This shouldn't actually evaluate anything since that should have been
            // done in the lineSearch.
            System.arraycopy(dFunction.derivativeAt(newX), 0, newGrad, 0, newGrad.length);
            // This is where all the s, y updates are applied.
            // step (4) in Andrew & Gao (2007)
            qn.update(newX, x, newGrad, rawGrad, newPoint[a]);
            if (useOWLQN) {
                System.arraycopy(newGrad, 0, rawGrad, 0, newGrad.length);
                // pseudo gradient
                newGrad = pseudoGradientOWL(newX, newGrad, dFunction);
            }
            double evalScore = Double.NEGATIVE_INFINITY;
            if (doEval) {
                evalScore = doEvaluation(newX);
            }
            // Add the current value and gradient to the records, this also monitors
            // X and writes to output
            rec.add(newValue, newGrad, newX, fevals, evalScore, sb);
            // If you want to call a function and do whatever with the information ...
            if (iterCallbackFunction != null) {
                iterCallbackFunction.callback(newX, its, newValue, newGrad);
            }
            // shift
            value = newValue;
            // double[] temp = x;
            // x = newX;
            // newX = temp;
            System.arraycopy(newX, 0, x, 0, x.length);
            System.arraycopy(newGrad, 0, grad, 0, newGrad.length);
            if (fevals > maxFevals) {
                throw new MaxEvaluationsExceeded("Exceeded in minimize() loop.");
            }
        } catch (SurpriseConvergence s) {
            sayln("QNMinimizer aborted due to surprise convergence");
            break;
        } catch (MaxEvaluationsExceeded m) {
            sayln("QNMinimizer aborted due to maximum number of function evaluations");
            sayln(m.toString());
            sayln("** This is not an acceptable termination of QNMinimizer, consider");
            sayln("** increasing the max number of evaluations, or safeguarding your");
            sayln("** program by checking the QNMinimizer.wasSuccessful() method.");
            break;
        } catch (OutOfMemoryError oome) {
            if (!qn.s.isEmpty()) {
                qn.s.remove(0);
                qn.y.remove(0);
                qn.rho.remove(0);
                sb.append("{Caught OutOfMemory, changing m from ").append(qn.mem).append(" to ").append(qn.s.size()).append("}]");
                qn.mem = qn.s.size();
            } else {
                throw oome;
            }
        }
    } while ((state = rec.toContinue(sb)) == eState.CONTINUE); // end do-while
    if (evaluateIters > 0) {
        // do final evaluation
        double evalScore = (useEvalImprovement ? doEvaluation(rec.getBest()) : doEvaluation(x));
        sayln("final evalScore is: " + evalScore);
    }
    switch(state) {
        case TERMINATE_GRADNORM:
            sayln("QNMinimizer terminated due to numerically zero gradient: |g| < EPS  max(1,|x|) ");
            success = true;
            break;
        case TERMINATE_RELATIVENORM:
            sayln("QNMinimizer terminated due to sufficient decrease in gradient norms: |g|/|g0| < TOL ");
            success = true;
            break;
        case TERMINATE_AVERAGEIMPROVE:
            sayln("QNMinimizer terminated due to average improvement: | newest_val - previous_val | / |newestVal| < TOL ");
            success = true;
            break;
        case TERMINATE_MAXITR:
            sayln("QNMinimizer terminated due to reached max iteration " + maxItr);
            success = true;
            break;
        case TERMINATE_EVALIMPROVE:
            sayln("QNMinimizer terminated due to no improvement on eval ");
            success = true;
            x = rec.getBest();
            break;
        default:
            log.warn("QNMinimizer terminated without converging");
            success = false;
            break;
    }
    double completionTime = rec.howLong();
    sayln("Total time spent in optimization: " + nfsec.format(completionTime) + 's');
    if (outputToFile) {
        infoFile.println(completionTime + "; Total Time ");
        infoFile.println(fevals + "; Total evaluations");
        infoFile.close();
        outFile.close();
    }
    qn.free();
    return x;
}
Also used : RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) IOException(java.io.IOException) RuntimeIOException(edu.stanford.nlp.io.RuntimeIOException) FileOutputStream(java.io.FileOutputStream) PrintWriter(java.io.PrintWriter)
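
The PrintWriter pattern worth noting here is the pair of autoflushing writers opened over FileOutputStreams (the outFile/infoFile setup). A minimal self-contained sketch of that pattern, with a hypothetical base name and made-up log lines:

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

public class QnLogSketch {
    public static void main(String[] args) throws IOException {
        String baseName = "QN_demo"; // hypothetical base name
        // The second constructor argument enables autoflush on println/printf/format.
        try (PrintWriter out = new PrintWriter(new FileOutputStream(baseName + ".output"), true);
             PrintWriter info = new PrintWriter(new FileOutputStream(baseName + ".info"), true)) {
            info.println(100 + "; DomainDimension"); // made-up values
            info.println(15 + "; memory");
            out.println("per-iteration record would go here");
        } // try-with-resources closes both writers, flushing any remaining output
    }
}

Autoflush is presumably why QNMinimizer's per-iteration records appear on disk while the optimizer is still running, rather than only when the writers are closed.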

Example 82 with PrintWriter

use of java.io.PrintWriter in project CoreNLP by stanfordnlp.

the class FactoredParser method main.

/* some documentation for Roger's convenience
 * {pcfg,dep,combo}{PE,DE,TE} are precision/dep/tagging evals for the models

 * parser is the PCFG parser
 * dparser is the dependency parser
 * bparser is the combining parser

 * during testing:
 * tree is the test tree (gold tree)
 * binaryTree is the gold tree binarized
 * tree2b is the best PCFG parse, binarized
 * tree2 is the best PCFG parse (debinarized)
 * tree3 is the dependency parse, binarized
 * tree3db is the dependency parse, debinarized
 * tree4 is the best combo parse, binarized and then debinarized
 * tree4b is the best combo parse, binarized
 */
public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.
    StringUtils.logInvocationString(log, args);
    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;
    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
        if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
            path = args[i + 1];
            i += 2;
        } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
            trainLow = Integer.parseInt(args[i + 1]);
            trainHigh = Integer.parseInt(args[i + 2]);
            i += 3;
        } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
            testLow = Integer.parseInt(args[i + 1]);
            testHigh = Integer.parseInt(args[i + 2]);
            i += 3;
        } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
            serializeFile = args[i + 1];
            i += 2;
        } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
            try {
                op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
            } catch (ClassNotFoundException e) {
                log.info("Class not found: " + args[i + 1]);
                throw new RuntimeException(e);
            } catch (InstantiationException e) {
                log.info("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                log.info("illegal access" + e);
                throw new RuntimeException(e);
            }
            i += 2;
        } else if (args[i].equals("-encoding")) {
            // sets encoding for TreebankLangParserParams
            op.tlpParams.setInputEncoding(args[i + 1]);
            op.tlpParams.setOutputEncoding(args[i + 1]);
            i += 2;
        } else {
            i = op.setOptionOrWarn(args, i);
        }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();
    op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();
    op.testOptions.display();
    op.trainOptions.display();
    op.display();
    op.tlpParams.display();
    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);
    Timing.startTime();
    log.info("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
        Collections.sort(testTreebank, new TreeLengthComparator());
    }
    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");
    log.info("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
        binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
        binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    }
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
        collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<>();
    if (op.trainOptions.selectiveSplit) {
        op.trainOptions.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack());
        if (op.trainOptions.deleteSplitters != null) {
            List<String> deleted = new ArrayList<>();
            for (String del : op.trainOptions.deleteSplitters) {
                String baseDel = tlp.basicCategory(del);
                boolean checkBasic = del.equals(baseDel);
                for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
                    String elem = it.next();
                    String baseElem = tlp.basicCategory(elem);
                    boolean delStr = (checkBasic && baseElem.equals(baseDel)) || elem.equals(del);
                    if (delStr) {
                        it.remove();
                        deleted.add(elem);
                    }
                }
            }
            log.info("Removed from vertical splitters: " + deleted);
        }
    }
    if (op.trainOptions.selectivePostSplit) {
        TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
        Treebank annotatedTB = trainTreebank.transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack());
    }
    if (op.trainOptions.hSelSplit) {
        binarizer.setDoSelectiveSplit(false);
        for (Tree tree : trainTreebank) {
            if (op.trainOptions.collinsPunc) {
                tree = collinsPuncTransformer.transformTree(tree);
            }
            //tree.pennPrint(tlpParams.pw());
            tree = binarizer.transformTree(tree);
        //binaryTrainTrees.add(tree);
        }
        binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
            tree = collinsPuncTransformer.transformTree(tree);
        }
        tree = binarizer.transformTree(tree);
        binaryTrainTrees.add(tree);
    }
    if (op.testOptions.verbose) {
        binarizer.dumpStats();
    }
    List<Tree> binaryTestTrees = new ArrayList<>();
    for (Tree tree : testTreebank) {
        if (op.trainOptions.collinsPunc) {
            tree = collinsPuncTransformer.transformTree(tree);
        }
        tree = binarizer.transformTree(tree);
        binaryTestTrees.add(tree);
    }
    // binarization
    Timing.tick("done.");
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    Index<String> stateIndex = new HashIndex<>();
    // extract grammars
    Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
    if (op.doPCFG) {
        log.info("Extracting PCFG...");
        Pair<UnaryGrammar, BinaryGrammar> bgug = null;
        if (op.trainOptions.cheatPCFG) {
            List<Tree> allTrees = new ArrayList<>(binaryTrainTrees);
            allTrees.addAll(binaryTestTrees);
            bgug = bgExtractor.extract(allTrees);
        } else {
            bgug = bgExtractor.extract(binaryTrainTrees);
        }
        bg = bgug.second;
        bg.splitRules();
        ug = bgug.first;
        ug.purgeRules();
        Timing.tick("done.");
    }
    log.info("Extracting Lexicon...");
    Index<String> wordIndex = new HashIndex<>();
    Index<String> tagIndex = new HashIndex<>();
    lex = op.tlpParams.lex(op, wordIndex, tagIndex);
    lex.initializeTraining(binaryTrainTrees.size());
    lex.train(binaryTrainTrees);
    lex.finishTraining();
    Timing.tick("done.");
    if (op.doDep) {
        log.info("Extracting Dependencies...");
        binaryTrainTrees.clear();
        Extractor<DependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
        // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));
        // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
        //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));
        //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
        // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
        //uses information whether the words are known or not, discards unknown words
        dg = dgExtractor.extract(binaryTrainTrees);
        Timing.tick("done.");
        //System.out.print("Extracting Unknown Word Model...");
        //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
        //Timing.tick("done.");
        System.out.print("Tuning Dependency Model...");
        dg.tune(binaryTestTrees);
        //System.out.println("TUNE DEPS: "+tuneDeps);
        Timing.tick("done.");
    }
    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;
    GrammarProjection gp = new NullGrammarProjection(bg, ug);
    // serialization
    if (serializeFile != null) {
        log.info("Serializing parser...");
        LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
        parser.saveParserToSerialized(serializeFile);
        Timing.tick("done.");
    }
    // test: pcfg-parse and output
    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
        parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
    }
    ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);
    Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
    //Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
        bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex);
    }
    Evalb pcfgPE = new Evalb("pcfg  PE", true);
    Evalb comboPE = new Evalb("combo PE", true);
    AbstractEval pcfgCB = new Evalb.CBEval("pcfg  CB", true);
    AbstractEval pcfgTE = new TaggingEval("pcfg  TE");
    AbstractEval comboTE = new TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
    AbstractEval depTE = new TaggingEval("depnd TE");
    AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
    AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());
    if (op.testOptions.evalb) {
        EvalbFormatWriter.initEVALBfiles(op.tlpParams);
    }
    // int[] countByLength = new int[op.testOptions.maxLength+1];
    // Use a reflection ruse, so one can run this without needing the
    // tagger.  Using a function rather than a MaxentTagger means we
    // can distribute a version of the parser that doesn't include the
    // entire tagger.
    Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null;
    if (op.testOptions.preTag) {
        try {
            Class[] argsClass = { String.class };
            Object[] arguments = new Object[] { op.testOptions.taggerSerializedFile };
            tagger = (Function<List<? extends HasWord>, ArrayList<TaggedWord>>) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger").getConstructor(argsClass).newInstance(arguments);
        } catch (Exception e) {
            log.info(e);
            log.info("Warning: No pretagging of sentences will be done.");
        }
    }
    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
        Tree tree = testTreebank.get(tNum);
        int testTreeLen = tree.yield().size();
        if (testTreeLen > op.testOptions.maxLength) {
            continue;
        }
        Tree binaryTree = binaryTestTrees.get(tNum);
        // countByLength[testTreeLen]++;
        System.out.println("-------------------------------------");
        System.out.println("Number: " + (tNum + 1));
        System.out.println("Length: " + testTreeLen);
        //tree.pennPrint(pw);
        // System.out.println("XXXX The binary tree is");
        // binaryTree.pennPrint(pw);
        //System.out.println("Here are the tags in the lexicon:");
        //System.out.println(lex.showTags());
        //System.out.println("Here's the tagnumberer:");
        //System.out.println(Numberer.getGlobalNumberer("tags").toString());
        long timeMil1 = System.currentTimeMillis();
        Timing.tick("Starting parse.");
        if (op.doPCFG) {
            //log.info(op.testOptions.forceTags);
            if (op.testOptions.forceTags) {
                if (tagger != null) {
                    //System.out.println("Using a tagger to set tags");
                    //System.out.println("Tagged sentence as: " + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
                    parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield())))));
                } else {
                    //System.out.println("Forcing tags to match input.");
                    parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
                }
            } else {
                // System.out.println("XXXX Parsing " + binaryTree.yield());
                parser.parse(binaryTree.yieldHasWord());
            }
        //Timing.tick("Done with pcfg phase.");
        }
        if (op.doDep) {
            dparser.parse(binaryTree.yieldHasWord());
        //Timing.tick("Done with dependency phase.");
        }
        boolean bothPassed = false;
        if (op.doPCFG && op.doDep) {
            bothPassed = bparser.parse(binaryTree.yieldHasWord());
        //Timing.tick("Done with combination phase.");
        }
        long timeMil2 = System.currentTimeMillis();
        long elapsed = timeMil2 - timeMil1;
        log.info("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
        //System.out.println("PCFG Best Parse:");
        Tree tree2b = null;
        Tree tree2 = null;
        //System.out.println("Got full best parse...");
        if (op.doPCFG) {
            tree2b = parser.getBestParse();
            tree2 = debinarizer.transformTree(tree2b);
        }
        //System.out.println("Debinarized parse...");
        //tree2.pennPrint();
        //System.out.println("DepG Best Parse:");
        Tree tree3 = null;
        Tree tree3db = null;
        if (op.doDep) {
            tree3 = dparser.getBestParse();
            // was (wrongly): tree3db = debinarizer.transformTree(tree2);
            tree3db = debinarizer.transformTree(tree3);
            tree3.pennPrint(pw);
        }
        //tree.pennPrint();
        //((Tree)binaryTrainTrees.get(tNum)).pennPrint();
        //System.out.println("Combo Best Parse:");
        Tree tree4 = null;
        if (op.doPCFG && op.doDep) {
            try {
                tree4 = bparser.getBestParse();
                if (tree4 == null) {
                    tree4 = tree2b;
                }
            } catch (NullPointerException e) {
                log.info("Blocked, using PCFG parse!");
                tree4 = tree2b;
            }
        }
        if (op.doPCFG && !bothPassed) {
            tree4 = tree2b;
        }
        //tree4.pennPrint();
        if (op.doDep) {
            depDE.evaluate(tree3, binaryTree, pw);
            depTE.evaluate(tree3db, tree, pw);
        }
        TreeTransformer tc = op.tlpParams.collinizer();
        TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
        if (op.doPCFG) {
            // System.out.println("XXXX Best PCFG was: ");
            // tree2.pennPrint();
            // System.out.println("XXXX Transformed best PCFG is: ");
            // tc.transformTree(tree2).pennPrint();
            //System.out.println("True Best Parse:");
            //tree.pennPrint();
            //tc.transformTree(tree).pennPrint();
            pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
            pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
            Tree tree4b = null;
            if (op.doDep) {
                comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
                tree4b = tree4;
                tree4 = debinarizer.transformTree(tree4);
                if (op.nodePrune) {
                    NodePruner np = new NodePruner(parser, debinarizer);
                    tree4 = np.prune(tree4);
                }
                //tree4.pennPrint();
                comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
            }
            //pcfgTE.evaluate(tree2, tree);
            pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
            pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
            if (op.doDep) {
                comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
                comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
            }
            System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));
            //tc.transformTree(tree2).pennPrint();
            tree2.pennPrint(pw);
            if (op.doDep) {
                System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
                // tc.transformTree(tree4).pennPrint(pw);
                tree4.pennPrint(pw);
            }
            System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
            /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
            tree.pennPrint(pw);
        }
        if (op.testOptions.evalb) {
            if (op.doPCFG && op.doDep) {
                EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
            } else if (op.doPCFG) {
                EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
            } else if (op.doDep) {
                EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
            }
        }
    }
    if (op.testOptions.evalb) {
        EvalbFormatWriter.closeEVALBfiles();
    }
    // op.testOptions.display();
    if (op.doPCFG) {
        pcfgPE.display(false, pw);
        System.out.println("Grammar size: " + stateIndex.size());
        pcfgCB.display(false, pw);
        if (op.doDep) {
            comboPE.display(false, pw);
        }
        pcfgTE.display(false, pw);
        pcfgTEnoPunct.display(false, pw);
        if (op.doDep) {
            comboTE.display(false, pw);
            comboTEnoPunct.display(false, pw);
        }
    }
    if (op.doDep) {
        depTE.display(false, pw);
        depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
        comboDE.display(false, pw);
    }
// pcfgPE.printGoodBad();
}
Also used : Treebank(edu.stanford.nlp.trees.Treebank) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) ArrayList(java.util.ArrayList) Tree(edu.stanford.nlp.trees.Tree) TreebankLanguagePack(edu.stanford.nlp.trees.TreebankLanguagePack) ArrayList(java.util.ArrayList) List(java.util.List) TaggingEval(edu.stanford.nlp.parser.metrics.TaggingEval) NumberRangeFileFilter(edu.stanford.nlp.io.NumberRangeFileFilter) Evalb(edu.stanford.nlp.parser.metrics.Evalb) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) UnlabeledAttachmentEval(edu.stanford.nlp.parser.metrics.UnlabeledAttachmentEval) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) PrintWriter(java.io.PrintWriter) Pair(edu.stanford.nlp.util.Pair) HasWord(edu.stanford.nlp.ling.HasWord) AbstractEval(edu.stanford.nlp.parser.metrics.AbstractEval) LeftHeadFinder(edu.stanford.nlp.trees.LeftHeadFinder) TreeLengthComparator(edu.stanford.nlp.trees.TreeLengthComparator) HashIndex(edu.stanford.nlp.util.HashIndex)
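
FactoredParser routes all of its evaluation output and tree printing through the single PrintWriter returned by op.tlpParams.pw(). A rough sketch of what such a shared writer amounts to, assuming pw() wraps System.out in the configured output encoding (the UTF-8 choice and the printed lines here are illustrative, not CoreNLP's actual defaults):

import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;

public class SharedWriterSketch {
    public static void main(String[] args) {
        // One autoflushing, encoding-aware writer shared by all display/eval calls.
        PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(System.out, StandardCharsets.UTF_8), true);
        pw.println("pcfg  PE results would be printed here");
        pw.printf("F1: %.2f%n", 89.51); // hypothetical score
        // Don't close a writer that wraps System.out while others still need it.
        pw.flush();
    }
}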

Example 83 with PrintWriter

use of java.io.PrintWriter in project CoreNLP by stanfordnlp.

the class ChineseCharacterBasedLexiconTraining method main.

public static void main(String[] args) throws IOException {
    Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
    flagsToNumArgs.put("-parser", Integer.valueOf(3));
    flagsToNumArgs.put("-lex", Integer.valueOf(3));
    flagsToNumArgs.put("-test", Integer.valueOf(2));
    flagsToNumArgs.put("-out", Integer.valueOf(1));
    flagsToNumArgs.put("-lengthPenalty", Integer.valueOf(1));
    flagsToNumArgs.put("-penaltyType", Integer.valueOf(1));
    flagsToNumArgs.put("-maxLength", Integer.valueOf(1));
    flagsToNumArgs.put("-stats", Integer.valueOf(2));
    Map<String, String[]> argMap = StringUtils.argsToMap(args, flagsToNumArgs);
    boolean eval = argMap.containsKey("-eval");
    PrintWriter pw = null;
    if (argMap.containsKey("-out")) {
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap.get("-out"))[0]), "GB18030"), true);
    }
    log.info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    for (int i = 0; i < args.length; i++) {
        ctpp.setOptionFlag(args, i);
        log.info(" " + args[i]);
    }
    log.info();
    Options op = new Options(ctpp);
    if (argMap.containsKey("-stats")) {
        String[] statArgs = (argMap.get("-stats"));
        MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
        FileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.loadPath(new File(statArgs[0]), trainFilt);
        log.info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.containsKey("-annotate")) {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
            for (Tree tree : rawTrainTreebank) {
                trainTreebank.add(annotator.transformTree(tree));
            }
            log.info("Done annotating trees.");
        } else {
            trainTreebank = rawTrainTreebank;
        }
        printStats(trainTreebank, pw);
        System.exit(0);
    }
    int maxLength = 1000000;
    //    Test.verbose = true;
    if (argMap.containsKey("-norm")) {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.containsKey("-maxLength")) {
        maxLength = Integer.parseInt((argMap.get("-maxLength"))[0]);
    }
    op.testOptions.maxLength = 120;
    boolean combo = argMap.containsKey("-combo");
    if (combo) {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    Lexicon lex = null;
    if (argMap.containsKey("-parser")) {
        String[] parserArgs = (argMap.get("-parser"));
        if (parserArgs.length > 1) {
            FileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.trainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.length == 3) {
                String filename = parserArgs[2];
                log.info("Writing parser in serialized format to file " + filename + " ");
                System.err.flush();
                ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
                out.writeObject(lp);
                out.close();
                log.info("done.");
            }
        } else {
            String parserFile = parserArgs[0];
            lp = LexicalizedParser.loadModel(parserFile, op);
        }
        lex = lp.getLexicon();
        op = lp.getOp();
        ctpp = (ChineseTreebankParserParams) op.tlpParams;
    }
    if (argMap.containsKey("-rad")) {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.containsKey("-lengthPenalty")) {
        ctpp.lengthPenalty = Double.parseDouble((argMap.get("-lengthPenalty"))[0]);
    }
    if (argMap.containsKey("-penaltyType")) {
        ctpp.penaltyType = Integer.parseInt((argMap.get("-penaltyType"))[0]);
    }
    if (argMap.containsKey("-lex")) {
        String[] lexArgs = (argMap.get("-lex"));
        if (lexArgs.length > 1) {
            Index<String> wordIndex = new HashIndex<>();
            Index<String> tagIndex = new HashIndex<>();
            lex = ctpp.lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.memoryTreebank();
            FileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.loadPath(new File(lexArgs[0]), trainFilt);
            log.info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.containsKey("-annotate")) {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.headFinder(), ctpp, op);
                for (Tree tree : rawTrainTreebank) {
                    tree = annotator.transformTree(tree);
                    trainTreebank.add(tree);
                }
                log.info("Done annotating trees.");
            } else {
                trainTreebank = rawTrainTreebank;
            }
            lex.initializeTraining(trainTreebank.size());
            lex.train(trainTreebank);
            lex.finishTraining();
            log.info("Done training lexicon.");
            if (lexArgs.length == 3) {
                String filename = lexArgs[2];
                log.info("Writing lexicon in serialized format to file " + filename + " ");
                System.err.flush();
                ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
                out.writeObject(lex);
                out.close();
                log.info("done.");
            }
        } else {
            String lexFile = lexArgs.length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.info("Reading Lexicon from file " + lexFile);
            ObjectInputStream in = IOUtils.readStreamFromString(lexFile);
            try {
                lex = (Lexicon) in.readObject();
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Bad serialized file: " + lexFile);
            }
            in.close();
        }
    }
    if (argMap.containsKey("-test")) {
        boolean segmentWords = ctpp.segment;
        boolean parse = lp != null;
        assert (parse || segmentWords);
        //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        WordSegmenter seg = null;
        if (segmentWords) {
            seg = (WordSegmenter) lex;
        }
        String[] testArgs = (argMap.get("-test"));
        MemoryTreebank testTreebank = op.tlpParams.memoryTreebank();
        FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.loadPath(new File(testArgs[0]), testFilt);
        TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
        TreeTransformer collinizer = ctpp.collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        List<String> evalTypes = new ArrayList<>(3);
        boolean goodPOS = false;
        if (segmentWords) {
            evalTypes.add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse) {
                evalTypes.add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse) {
            evalTypes.add(WordCatConstituent.tagType);
            evalTypes.add(WordCatConstituent.catType);
            if (combo) {
                evalTypes.add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.info("Testing...");
        for (Tree goldTop : testTreebank) {
            Tree gold = goldTop.firstChild();
            List<HasWord> goldSentence = gold.yieldHasWord();
            if (goldSentence.size() > maxLength) {
                log.info("Skipping sentence; too long: " + goldSentence.size());
                continue;
            } else {
                log.info("Processing sentence; length: " + goldSentence.size());
            }
            List<HasWord> s;
            if (segmentWords) {
                StringBuilder goldCharBuf = new StringBuilder();
                for (HasWord aGoldSentence : goldSentence) {
                    StringLabel word = (StringLabel) aGoldSentence;
                    goldCharBuf.append(word.value());
                }
                String goldChars = goldCharBuf.toString();
                s = seg.segment(goldChars);
            } else {
                s = goldSentence;
            }
            Tree tree;
            if (parse) {
                tree = lp.parseTree(s);
                if (tree == null) {
                    throw new RuntimeException("PARSER RETURNED NULL!!!");
                }
            } else {
                tree = Trees.toFlatTree(s);
                tree = subcategoryStripper.transformTree(tree);
            }
            if (pw != null) {
                if (parse) {
                    tree.pennPrint(pw);
                } else {
                    // print the segmented words, space-separated
                    Iterator<HasWord> sentIter = s.iterator();
                    while (sentIter.hasNext()) {
                        Word word = (Word) sentIter.next();
                        pw.print(word.word());
                        if (sentIter.hasNext()) {
                            pw.print(' ');
                        }
                    }
                }
                pw.println();
            }
            if (eval) {
                Collection ourBrackets, goldBrackets;
                ourBrackets = proc.allBrackets(tree);
                goldBrackets = proc.allBrackets(gold);
                if (goodPOS) {
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(tree, gold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(gold, tree));
                }
                basicEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nScores:");
                basicEval.displayLast();
                Tree collinsTree = collinizer.transformTree(tree);
                Tree collinsGold = collinizer.transformTree(gold);
                ourBrackets = proc.allBrackets(collinsTree);
                goldBrackets = proc.allBrackets(collinsGold);
                if (goodPOS) {
                    ourBrackets.addAll(proc.commonWordTagTypeBrackets(collinsTree, collinsGold));
                    goldBrackets.addAll(proc.commonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.eval(ourBrackets, goldBrackets);
                System.out.println("\nCollinized scores:");
                collinsEval.displayLast();
                System.out.println();
            }
        }
        if (eval) {
            basicEval.display();
            System.out.println();
            collinsEval.display();
        }
    }
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) TaggedWord(edu.stanford.nlp.ling.TaggedWord) Word(edu.stanford.nlp.ling.Word) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) ArrayList(java.util.ArrayList) ObjectOutputStream(java.io.ObjectOutputStream) StringLabel(edu.stanford.nlp.ling.StringLabel) TreeToBracketProcessor(edu.stanford.nlp.trees.TreeToBracketProcessor) WordSegmenter(edu.stanford.nlp.process.WordSegmenter) Iterator(java.util.Iterator) Tree(edu.stanford.nlp.trees.Tree) MemoryTreebank(edu.stanford.nlp.trees.MemoryTreebank) NumberRangesFileFilter(edu.stanford.nlp.io.NumberRangesFileFilter) FileFilter(java.io.FileFilter) PrintWriter(java.io.PrintWriter) HasWord(edu.stanford.nlp.ling.HasWord) WordCatEqualityChecker(edu.stanford.nlp.trees.WordCatEqualityChecker) HashIndex(edu.stanford.nlp.util.HashIndex) WordCatEquivalenceClasser(edu.stanford.nlp.trees.WordCatEquivalenceClasser) FileOutputStream(java.io.FileOutputStream) Collection(java.util.Collection) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) ObjectInputStream(java.io.ObjectInputStream)
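
The -out handling above shows the classic way to get a PrintWriter in an explicit encoding: wrap the FileOutputStream in an OutputStreamWriter before handing it to PrintWriter. A small sketch of both that form and the shorter constructor available since Java 10; file names and content are hypothetical:

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;

public class EncodedWriterSketch {
    public static void main(String[] args) throws IOException {
        Charset gb = Charset.forName("GB18030");
        // Pre-Java-10 pattern: explicit charset via OutputStreamWriter, autoflush on.
        PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream("parses.txt"), gb), true);
        pw.println("(ROOT (NP ...))");
        pw.close();
        // Since Java 10, PrintWriter(String, Charset) does the same in one step.
        try (PrintWriter pw2 = new PrintWriter("parses2.txt", gb)) {
            pw2.println("(ROOT (NP ...))");
        }
    }
}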

Example 84 with PrintWriter

use of java.io.PrintWriter in project CoreNLP by stanfordnlp.

the class BaseLexicon method printLexStats.

/** Print some statistics about this lexicon. */
public void printLexStats() {
    System.out.println("BaseLexicon statistics");
    System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel());
    // System.out.println("Rules size: " + rules.size());
    System.out.println("Sum of rulesWithWord: " + numRules());
    System.out.println("Tags size: " + tags.size());
    int wsize = words.size();
    System.out.println("Words size: " + wsize);
    // System.out.println("Unseen Sigs size: " + sigs.size() +
    // " [number of unknown equivalence classes]");
    System.out.println("rulesWithWord length: " + rulesWithWord.length + " [should be sum of words + unknown sigs]");
    int[] lengths = new int[STATS_BINS];
    ArrayList<String>[] wArr = new ArrayList[STATS_BINS];
    for (int j = 0; j < STATS_BINS; j++) {
        wArr[j] = new ArrayList<>();
    }
    for (int i = 0; i < rulesWithWord.length; i++) {
        int num = rulesWithWord[i].size();
        if (num > STATS_BINS - 1) {
            num = STATS_BINS - 1;
        }
        lengths[num]++;
        if (wsize <= 20 || num >= STATS_BINS / 2) {
            wArr[num].add(wordIndex.get(i));
        }
    }
    System.out.println("Stats on how many taggings for how many words");
    for (int j = 0; j < STATS_BINS; j++) {
        System.out.print(j + " taggings: " + lengths[j] + " words ");
        if (wsize <= 20 || j >= STATS_BINS / 2) {
            System.out.print(wArr[j]);
        }
        System.out.println();
    }
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));
    if (wsize < 50 && tags.size() < 10) {
        nf.setMaximumFractionDigits(3);
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        pw.println("Tagging probabilities log P(word|tag)");
        for (int t = 0; t < tags.size(); t++) {
            pw.print('\t');
            pw.print(tagIndex.get(t));
        }
        pw.println();
        for (int w = 0; w < wsize; w++) {
            pw.print(wordIndex.get(w));
            pw.print('\t');
            for (int t = 0; t < tags.size(); t++) {
                IntTaggedWord iTW = new IntTaggedWord(w, t);
                pw.print(nf.format(score(iTW, 1, wordIndex.get(w), null)));
                if (t == tags.size() - 1) {
                    pw.println();
                } else {
                    pw.print('\t');
                }
            }
        }
        pw.close();
        System.out.println(sw.toString());
    }
}
Also used : StringWriter(java.io.StringWriter) NumberFormat(java.text.NumberFormat) PrintWriter(java.io.PrintWriter) EncodingPrintWriter(edu.stanford.nlp.io.EncodingPrintWriter)
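
printLexStats builds the whole probability table in memory by pointing a PrintWriter at a StringWriter, then dumping the accumulated string once the table is complete. The same idiom in isolation, with made-up row data:

import java.io.PrintWriter;
import java.io.StringWriter;

public class StringWriterSketch {
    public static void main(String[] args) {
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw); // no autoflush needed; StringWriter is in-memory
        pw.println("Tagging probabilities log P(word|tag)");
        pw.print("the");
        pw.print('\t');
        pw.println(-0.105); // hypothetical log probability
        pw.close(); // flushes everything into the StringWriter
        System.out.println(sw.toString());
    }
}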

Example 85 with PrintWriter

use of java.io.PrintWriter in project CoreNLP by stanfordnlp.

the class RuleBranchingFactor method main.

public static void main(String[] args) {
    if (args.length < minArgs) {
        System.out.println(usage);
        System.exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.argsToProperties(args, optionArgDefinitions);
    String fileName = options.getProperty("");
    if (fileName == null || fileName.equals("")) {
        System.out.println(usage);
        System.exit(-1);
    }
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    String encoding = options.getProperty("e", "UTF-8");
    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();
    tb.loadPath(fileName);
    // Statistics
    Counter<String> binaryRuleTypes = new ClassicCounter<>(20000);
    List<Integer> branchingFactors = new ArrayList<>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    PrintWriter pw = tlpp.pw(); // encoding-aware writer from the language params (unused in this snippet)
    // Read the treebank
    for (Tree tree : tb) {
        if (tree.value().equals("ROOT")) {
            tree = tree.firstChild();
        }
        ++nTrees;
        for (Tree subTree : tree) {
            if (subTree.isPhrasal()) {
                if (subTree.numChildren() > 1) {
                    ++nBinaryRules;
                    branchingFactors.add(subTree.numChildren());
                    binaryBranchingFactors += subTree.numChildren();
                    binaryRuleTypes.incrementCount(treeToRuleString(subTree));
                } else {
                    ++nUnaryRules;
                }
            }
        }
    }
    double mean = (double) binaryBranchingFactors / (double) nBinaryRules;
    System.out.printf("#trees:\t%d%n", nTrees);
    System.out.printf("#binary:\t%d%n", nBinaryRules);
    System.out.printf("#binary types:\t%d%n", binaryRuleTypes.keySet().size());
    System.out.printf("mean branching:\t%.4f%n", mean);
    System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branchingFactors, mean));
    System.out.printf("rule entropy:\t%.5f%n", Counters.entropy(binaryRuleTypes));
    System.out.printf("#unaries:\t%d%n", nUnaryRules);
}
Also used : DiskTreebank(edu.stanford.nlp.trees.DiskTreebank) ArrayList(java.util.ArrayList) TreebankLangParserParams(edu.stanford.nlp.parser.lexparser.TreebankLangParserParams) Properties(java.util.Properties) Language(edu.stanford.nlp.international.Language) ClassicCounter(edu.stanford.nlp.stats.ClassicCounter) Tree(edu.stanford.nlp.trees.Tree) PrintWriter(java.io.PrintWriter)
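
The snippet calls a standardDeviation(branchingFactors, mean) helper that is not shown. A plausible implementation, assuming it computes the population standard deviation of the recorded branching factors (the sample data in main is made up):

import java.util.Arrays;
import java.util.List;

public class StdDevSketch {
    // Population standard deviation, given a precomputed mean.
    static double standardDeviation(List<Integer> values, double mean) {
        double sumSq = 0.0;
        for (int v : values) {
            double d = v - mean;
            sumSq += d * d;
        }
        return Math.sqrt(sumSq / values.size());
    }

    public static void main(String[] args) {
        List<Integer> branching = Arrays.asList(2, 2, 3, 2, 4); // hypothetical branching factors
        double mean = branching.stream().mapToInt(Integer::intValue).average().orElse(0);
        System.out.printf("stddev branching:\t%.4f%n", standardDeviation(branching, mean));
    }
}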

Aggregations

PrintWriter (java.io.PrintWriter): 4039
StringWriter (java.io.StringWriter): 1201
IOException (java.io.IOException): 788
File (java.io.File): 643
Test (org.junit.Test): 512
FileWriter (java.io.FileWriter): 318
FileOutputStream (java.io.FileOutputStream): 313
OutputStreamWriter (java.io.OutputStreamWriter): 278
BufferedReader (java.io.BufferedReader): 202
ArrayList (java.util.ArrayList): 196
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 162
HttpServletResponse (javax.servlet.http.HttpServletResponse): 145
InputStreamReader (java.io.InputStreamReader): 140
Date (java.util.Date): 131
HashMap (java.util.HashMap): 130
ServletException (javax.servlet.ServletException): 126
BufferedWriter (java.io.BufferedWriter): 125
HttpServletRequest (javax.servlet.http.HttpServletRequest): 125
FastPrintWriter (com.android.internal.util.FastPrintWriter): 124
Map (java.util.Map): 118