Search in sources :

Example 1 with CompositeTreeTransformer

use of edu.stanford.nlp.trees.CompositeTreeTransformer in project CoreNLP by stanfordnlp.

the class Options method setOptionFlag.

/**
   * Set an option in this object, based on a String array in the style of
   * commandline flags.  The option is only processed with respect to
   * options directly known by the Options object.
   * Some options (there are many others; see the source code):
   * <ul>
   * <li> <code>-maxLength n</code> set the maximum length sentence to parse (inclusively)
   * <li> <code>-printTT n</code> print the training trees in raw, annotated, and annotated+binarized form.  Useful for debugging and other miscellany.  The integer <code>n</code> is stored in <code>trainOptions.printTreeTransformations</code>.
   * <li> <code>-printAnnotated filename</code> use only in conjunction with -printTT.  Redirects printing of annotated training trees to <code>filename</code>.
   * <li> <code>-forceTags</code> when the parser is tested against a set of gold standard trees, use the tagged yield, instead of just the yield, as input.
   * </ul>
   * <p>
   * Every flag that takes a value is only recognized when the value is
   * actually present in <code>args</code>; a value-taking flag in the last
   * position is treated as unprocessed (the index is returned unchanged)
   * rather than reading past the end of the array.
   * <p>
   * The "preset" flags (<code>-acl03pcfg</code>, <code>-goodPCFG</code>,
   * <code>-goodFactored</code>, <code>-chineseFactored</code>, ...) deliberately
   * return <code>i</code> unchanged after applying their settings so that the
   * same flag can also be handled by language specific option processing.
   *
   * @param args An array of options arguments, command-line style.  E.g. {"-maxLength", "50"}.
   * @param i The index in args to start at when processing an option
   * @return The index in args of the position after the last element used in
   *      processing this option, or the value i unchanged if a valid option couldn't
   *      be processed starting at position i.
   * @throws NumberFormatException if the value of a numeric option cannot be parsed
   * @throws IllegalArgumentException if <code>-distanceBins</code> is given an unsupported value
   */
protected int setOptionFlag(String[] args, int i) {
    if (args[i].equalsIgnoreCase("-PCFG")) {
        doDep = false;
        doPCFG = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-dep")) {
        doDep = true;
        doPCFG = false;
        i++;
    } else if (args[i].equalsIgnoreCase("-factored")) {
        doDep = true;
        doPCFG = true;
        testOptions.useFastFactored = false;
        i++;
    } else if (args[i].equalsIgnoreCase("-fastFactored")) {
        doDep = true;
        doPCFG = true;
        testOptions.useFastFactored = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-noRecoveryTagging")) {
        testOptions.noRecoveryTagging = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-useLexiconToScoreDependencyPwGt")) {
        testOptions.useLexiconToScoreDependencyPwGt = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-useSmoothTagProjection")) {
        useSmoothTagProjection = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-useUnigramWordSmoothing")) {
        useUnigramWordSmoothing = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-useNonProjectiveDependencyParser")) {
        testOptions.useNonProjectiveDependencyParser = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-maxLength") && (i + 1 < args.length)) {
        testOptions.maxLength = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-MAX_ITEMS") && (i + 1 < args.length)) {
        testOptions.MAX_ITEMS = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-trainLength") && (i + 1 < args.length)) {
        // train on only short sentences
        trainOptions.trainLengthLimit = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-lengthNormalization")) {
        testOptions.lengthNormalization = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-iterativeCKY")) {
        testOptions.iterativeCKY = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-vMarkov") && (i + 1 < args.length)) {
        // order <= 1: no parent annotation; 2: parent only; >= 3: parent and grandparent
        int order = Integer.parseInt(args[i + 1]);
        if (order <= 1) {
            trainOptions.PA = false;
            trainOptions.gPA = false;
        } else if (order == 2) {
            trainOptions.PA = true;
            trainOptions.gPA = false;
        } else if (order >= 3) {
            trainOptions.PA = true;
            trainOptions.gPA = true;
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-vSelSplitCutOff") && (i + 1 < args.length)) {
        // a non-positive cutoff switches selective splitting off
        trainOptions.selectiveSplitCutOff = Double.parseDouble(args[i + 1]);
        trainOptions.selectiveSplit = trainOptions.selectiveSplitCutOff > 0.0;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-vSelPostSplitCutOff") && (i + 1 < args.length)) {
        trainOptions.selectivePostSplitCutOff = Double.parseDouble(args[i + 1]);
        trainOptions.selectivePostSplit = trainOptions.selectivePostSplitCutOff > 0.0;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-deleteSplitters") && (i + 1 < args.length)) {
        // comma-separated list; spaces around the commas are ignored
        String[] toDel = args[i + 1].split(" *, *");
        trainOptions.deleteSplitters = Generics.newHashSet(Arrays.asList(toDel));
        i += 2;
    } else if (args[i].equalsIgnoreCase("-postSplitWithBaseCategory")) {
        trainOptions.postSplitWithBaseCategory = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-vPostMarkov") && (i + 1 < args.length)) {
        int order = Integer.parseInt(args[i + 1]);
        if (order <= 1) {
            trainOptions.postPA = false;
            trainOptions.postGPA = false;
        } else if (order == 2) {
            trainOptions.postPA = true;
            trainOptions.postGPA = false;
        } else if (order >= 3) {
            trainOptions.postPA = true;
            trainOptions.postGPA = true;
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-hMarkov") && (i + 1 < args.length)) {
        // a negative order switches markov factoring off and leaves markovOrder untouched
        int order = Integer.parseInt(args[i + 1]);
        if (order >= 0) {
            trainOptions.markovOrder = order;
            trainOptions.markovFactor = true;
        } else {
            trainOptions.markovFactor = false;
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-distanceBins") && (i + 1 < args.length)) {
        // only <= 1 (off), 4 (coarse) and 5 (fine) are accepted
        int numBins = Integer.parseInt(args[i + 1]);
        if (numBins <= 1) {
            distance = false;
        } else if (numBins == 4) {
            distance = true;
            coarseDistance = true;
        } else if (numBins == 5) {
            distance = true;
            coarseDistance = false;
        } else {
            throw new IllegalArgumentException("Invalid value for -distanceBins: " + args[i + 1]);
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-noStop")) {
        genStop = false;
        i++;
    } else if (args[i].equalsIgnoreCase("-nonDirectional")) {
        directional = false;
        i++;
    } else if (args[i].equalsIgnoreCase("-depWeight") && (i + 1 < args.length)) {
        testOptions.depWeight = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-printPCFGkBest") && (i + 1 < args.length)) {
        testOptions.printPCFGkBest = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-evalPCFGkBest") && (i + 1 < args.length)) {
        testOptions.evalPCFGkBest = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-printFactoredKGood") && (i + 1 < args.length)) {
        testOptions.printFactoredKGood = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-smoothTagsThresh") && (i + 1 < args.length)) {
        lexOptions.smoothInUnknownsThreshold = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unseenSmooth") && (i + 1 < args.length)) {
        testOptions.unseenSmooth = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-fractionBeforeUnseenCounting") && (i + 1 < args.length)) {
        trainOptions.fractionBeforeUnseenCounting = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-hSelSplitThresh") && (i + 1 < args.length)) {
        trainOptions.HSEL_CUT = Integer.parseInt(args[i + 1]);
        trainOptions.hSelSplit = trainOptions.HSEL_CUT > 0;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-nohSelSplit")) {
        trainOptions.hSelSplit = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-tagPA")) {
        trainOptions.tagPA = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noTagPA")) {
        trainOptions.tagPA = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-tagSelSplitCutOff") && (i + 1 < args.length)) {
        trainOptions.tagSelectiveSplitCutOff = Double.parseDouble(args[i + 1]);
        trainOptions.tagSelectiveSplit = trainOptions.tagSelectiveSplitCutOff > 0.0;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-tagSelPostSplitCutOff") && (i + 1 < args.length)) {
        trainOptions.tagSelectivePostSplitCutOff = Double.parseDouble(args[i + 1]);
        trainOptions.tagSelectivePostSplit = trainOptions.tagSelectivePostSplitCutOff > 0.0;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-noTagSplit")) {
        trainOptions.noTagSplit = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-uwm") && (i + 1 < args.length)) {
        lexOptions.useUnknownWordSignatures = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unknownSuffixSize") && (i + 1 < args.length)) {
        lexOptions.unknownSuffixSize = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unknownPrefixSize") && (i + 1 < args.length)) {
        lexOptions.unknownPrefixSize = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-uwModelTrainer") && (i + 1 < args.length)) {
        lexOptions.uwModelTrainer = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-openClassThreshold") && (i + 1 < args.length)) {
        trainOptions.openClassTypesThreshold = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unary") && i + 1 < args.length) {
        trainOptions.markUnary = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unaryTags")) {
        trainOptions.markUnaryTags = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-mutate")) {
        lexOptions.smartMutation = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-useUnicodeType")) {
        lexOptions.useUnicodeType = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-rightRec")) {
        trainOptions.rightRec = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noRightRec")) {
        trainOptions.rightRec = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-preTag")) {
        testOptions.preTag = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-forceTags")) {
        testOptions.forceTags = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-taggerSerializedFile") && (i + 1 < args.length)) {
        testOptions.taggerSerializedFile = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-forceTagBeginnings")) {
        testOptions.forceTagBeginnings = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noFunctionalForcing")) {
        testOptions.noFunctionalForcing = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-scTags")) {
        dcTags = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-dcTags")) {
        dcTags = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-basicCategoryTagsInDependencyGrammar")) {
        trainOptions.basicCategoryTagsInDependencyGrammar = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-evalb")) {
        testOptions.evalb = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-v") || args[i].equalsIgnoreCase("-verbose")) {
        testOptions.verbose = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-outputFilesDirectory") && i + 1 < args.length) {
        testOptions.outputFilesDirectory = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-outputFilesExtension") && i + 1 < args.length) {
        testOptions.outputFilesExtension = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-outputFilesPrefix") && i + 1 < args.length) {
        testOptions.outputFilesPrefix = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-outputkBestEquivocation") && i + 1 < args.length) {
        testOptions.outputkBestEquivocation = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-writeOutputFiles")) {
        testOptions.writeOutputFiles = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-printAllBestParses")) {
        testOptions.printAllBestParses = true;
        i += 1;
    } else if ((args[i].equalsIgnoreCase("-outputTreeFormat") || args[i].equalsIgnoreCase("-outputFormat")) && (i + 1 < args.length)) {
        testOptions.outputFormat = args[i + 1];
        i += 2;
    } else if ((args[i].equalsIgnoreCase("-outputTreeFormatOptions") || args[i].equalsIgnoreCase("-outputFormatOptions")) && (i + 1 < args.length)) {
        testOptions.outputFormatOptions = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-addMissingFinalPunctuation")) {
        testOptions.addMissingFinalPunctuation = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-flexiTag")) {
        lexOptions.flexiTag = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-lexiTag")) {
        lexOptions.flexiTag = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-useSignatureForKnownSmoothing")) {
        lexOptions.useSignatureForKnownSmoothing = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-wordClassesFile") && (i + 1 < args.length)) {
        lexOptions.wordClassesFile = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-compactGrammar") && (i + 1 < args.length)) {
        trainOptions.compactGrammar = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-markFinalStates") && (i + 1 < args.length)) {
        trainOptions.markFinalStates = args[i + 1].equalsIgnoreCase("true");
        i += 2;
    } else if (args[i].equalsIgnoreCase("-leftToRight") && (i + 1 < args.length)) {
        // NOTE: unlike -markFinalStates and -nodePrune, the value is compared case-sensitively
        trainOptions.leftToRight = args[i + 1].equals("true");
        i += 2;
    } else if (args[i].equalsIgnoreCase("-cnf")) {
        forceCNF = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-smoothRules") && (i + 1 < args.length)) {
        trainOptions.ruleSmoothing = true;
        trainOptions.ruleSmoothingAlpha = Double.valueOf(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-nodePrune") && i + 1 < args.length) {
        nodePrune = args[i + 1].equalsIgnoreCase("true");
        i += 2;
    } else if (args[i].equalsIgnoreCase("-noDoRecovery")) {
        testOptions.doRecovery = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-acl03chinese")) {
        trainOptions.markovOrder = 1;
        trainOptions.markovFactor = true;
    // no increment
    } else if (args[i].equalsIgnoreCase("-wordFunction") && (i + 1 < args.length)) {
        wordFunction = ReflectionLoading.loadByReflection(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-acl03pcfg")) {
        doDep = false;
        doPCFG = true;
        // lexOptions.smoothInUnknownsThreshold = 30;
        trainOptions.markUnary = 1;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.tagPA = true;
        trainOptions.tagSelectiveSplit = false;
        trainOptions.rightRec = true;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 400.0;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        lexOptions.useUnknownWordSignatures = 2;
        lexOptions.flexiTag = true;
        // DAN: Tag double-counting is BAD for PCFG-only parsing
        dcTags = false;
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-jenny")) {
        doDep = false;
        doPCFG = true;
        // lexOptions.smoothInUnknownsThreshold = 30;
        trainOptions.markUnary = 1;
        trainOptions.PA = false;
        trainOptions.gPA = false;
        trainOptions.tagPA = false;
        trainOptions.tagSelectiveSplit = false;
        trainOptions.rightRec = true;
        trainOptions.selectiveSplit = false;
        //      trainOptions.selectiveSplitCutOff = 400.0;
        trainOptions.markovFactor = false;
        //      trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = false;
        lexOptions.useUnknownWordSignatures = 2;
        lexOptions.flexiTag = true;
        // DAN: Tag double-counting is BAD for PCFG-only parsing
        dcTags = false;
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-goodPCFG")) {
        doDep = false;
        doPCFG = true;
        // op.lexOptions.smoothInUnknownsThreshold = 30;
        trainOptions.markUnary = 1;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.tagPA = true;
        trainOptions.tagSelectiveSplit = false;
        trainOptions.rightRec = true;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 400.0;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        lexOptions.useUnknownWordSignatures = 2;
        lexOptions.flexiTag = true;
        // DAN: Tag double-counting is BAD for PCFG-only parsing
        dcTags = false;
        // reuse the -deleteSplitters branch; it consumes exactly two arguments on success
        String[] delSplit = { "-deleteSplitters", "VP^NP,VP^VP,VP^SINV,VP^SQ" };
        if (this.setOptionFlag(delSplit, 0) != 2) {
            log.info("Error processing deleteSplitters");
        }
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-linguisticPCFG")) {
        doDep = false;
        doPCFG = true;
        // op.lexOptions.smoothInUnknownsThreshold = 30;
        trainOptions.markUnary = 1;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        // on at the moment, but iffy
        trainOptions.tagPA = true;
        trainOptions.tagSelectiveSplit = false;
        // not for linguistic
        trainOptions.rightRec = false;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 400.0;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        // different from acl03pcfg
        lexOptions.useUnknownWordSignatures = 5;
        // different from acl03pcfg
        lexOptions.flexiTag = false;
        // DAN: Tag double-counting is BAD for PCFG-only parsing
        dcTags = false;
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-ijcai03")) {
        doDep = true;
        doPCFG = true;
        trainOptions.markUnary = 0;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.tagPA = false;
        trainOptions.tagSelectiveSplit = false;
        trainOptions.rightRec = false;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 300.0;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        /// cdm: May 2005 compacting bad for factored?
        trainOptions.compactGrammar = 0;
        lexOptions.useUnknownWordSignatures = 2;
        lexOptions.flexiTag = false;
        dcTags = true;
    // op.nodePrune = true;  // cdm: May 2005: this doesn't help
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-goodFactored")) {
        doDep = true;
        doPCFG = true;
        trainOptions.markUnary = 0;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.tagPA = false;
        trainOptions.tagSelectiveSplit = false;
        trainOptions.rightRec = false;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 300.0;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        /// cdm: May 2005 compacting bad for factored?
        trainOptions.compactGrammar = 0;
        // different from ijcai03
        lexOptions.useUnknownWordSignatures = 5;
        lexOptions.flexiTag = false;
        dcTags = true;
    // op.nodePrune = true;  // cdm: May 2005: this doesn't help
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-chineseFactored")) {
        // Single counting tag->word rewrite is also much better for Chinese
        // Factored.  Bracketing F1 goes up about 0.7%.
        dcTags = false;
        lexOptions.useUnicodeType = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        trainOptions.markovFactor = true;
        trainOptions.HSEL_CUT = 50;
    // trainOptions.openClassTypesThreshold=1;  // so can get unseen punctuation
    // trainOptions.fractionBeforeUnseenCounting=0.0;  // so can get unseen punctuation
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-arabicFactored")) {
        doDep = true;
        doPCFG = true;
        // "false" seems to help Arabic about 0.1% F1
        dcTags = false;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        // 75 bit better than 50, 100 a bit worse
        trainOptions.HSEL_CUT = 75;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 300.0;
        // Helps PCFG and marginally factLB
        trainOptions.markUnary = 1;
        // trainOptions.compactGrammar = 0;  // Doesn't seem to help or only 0.05% F1
        lexOptions.useUnknownWordSignatures = 9;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 1;
        // Arabic sentences are long enough that this helps a fraction
        testOptions.MAX_ITEMS = 500000;
    // don't increment i so it gets language specific stuff as well
    } else if (args[i].equalsIgnoreCase("-frenchFactored")) {
        doDep = true;
        doPCFG = true;
        //wsg2011: Setting to false improves F1 by 0.5%
        dcTags = false;
        trainOptions.markovFactor = true;
        trainOptions.markovOrder = 2;
        trainOptions.hSelSplit = true;
        trainOptions.HSEL_CUT = 75;
        trainOptions.PA = true;
        trainOptions.gPA = false;
        trainOptions.selectiveSplit = true;
        trainOptions.selectiveSplitCutOff = 300.0;
        //Unary rule marking bad for french..setting to 0 gives +0.3 F1
        trainOptions.markUnary = 0;
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 2;
    // no increment
    } else if (args[i].equalsIgnoreCase("-chinesePCFG")) {
        trainOptions.markovOrder = 2;
        trainOptions.markovFactor = true;
        trainOptions.HSEL_CUT = 5;
        trainOptions.PA = true;
        trainOptions.gPA = true;
        trainOptions.selectiveSplit = false;
        doDep = false;
        doPCFG = true;
        // Single counting tag->word rewrite is also much better for Chinese PCFG
        // Bracketing F1 is up about 2% and tag accuracy about 1% (exact by 6%)
        dcTags = false;
    // no increment
    } else if (args[i].equalsIgnoreCase("-printTT") && (i + 1 < args.length)) {
        trainOptions.printTreeTransformations = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-printAnnotatedRuleCounts")) {
        trainOptions.printAnnotatedRuleCounts = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-printAnnotatedStateCounts")) {
        trainOptions.printAnnotatedStateCounts = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-printAnnotated") && (i + 1 < args.length)) {
        try {
            trainOptions.printAnnotatedPW = tlpParams.pw(new FileOutputStream(args[i + 1]));
        } catch (IOException ioe) {
            // best effort: keep going without the writer, but say why it is missing
            log.info("Could not open " + args[i + 1] + " for -printAnnotated: " + ioe);
            trainOptions.printAnnotatedPW = null;
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-printBinarized") && (i + 1 < args.length)) {
        try {
            trainOptions.printBinarizedPW = tlpParams.pw(new FileOutputStream(args[i + 1]));
        } catch (IOException ioe) {
            // best effort: keep going without the writer, but say why it is missing
            log.info("Could not open " + args[i + 1] + " for -printBinarized: " + ioe);
            trainOptions.printBinarizedPW = null;
        }
        i += 2;
    } else if (args[i].equalsIgnoreCase("-printStates")) {
        trainOptions.printStates = true;
        i++;
    } else if (args[i].equalsIgnoreCase("-preTransformer") && (i + 1 < args.length)) {
        // comma-separated class names; several classes are chained in a CompositeTreeTransformer
        String[] classes = args[i + 1].split(",");
        i += 2;
        if (classes.length == 1) {
            trainOptions.preTransformer = ReflectionLoading.loadByReflection(classes[0], this);
        } else if (classes.length > 1) {
            CompositeTreeTransformer composite = new CompositeTreeTransformer();
            trainOptions.preTransformer = composite;
            for (String clazz : classes) {
                TreeTransformer transformer = ReflectionLoading.loadByReflection(clazz, this);
                composite.addTransformer(transformer);
            }
        }
    } else if (args[i].equalsIgnoreCase("-taggedFiles") && (i + 1 < args.length)) {
        trainOptions.taggedFiles = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-predictSplits")) {
        // This is an experimental (and still in development)
        // reimplementation of Berkeley's state splitting grammar.
        trainOptions.predictSplits = true;
        trainOptions.compactGrammar = 0;
        i++;
    } else if (args[i].equalsIgnoreCase("-splitCount") && (i + 1 < args.length)) {
        trainOptions.splitCount = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-splitRecombineRate") && (i + 1 < args.length)) {
        trainOptions.splitRecombineRate = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if ((args[i].equalsIgnoreCase("-trainingThreads") || args[i].equalsIgnoreCase("-nThreads")) && (i + 1 < args.length)) {
        // this flag sets both thread counts; -testingThreads sets only the testing one
        int numThreads = Integer.parseInt(args[i + 1]);
        trainOptions.trainingThreads = numThreads;
        testOptions.testingThreads = numThreads;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-testingThreads") && (i + 1 < args.length)) {
        testOptions.testingThreads = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-evals") && (i + 1 < args.length)) {
        testOptions.evals = StringUtils.stringToProperties(args[i + 1], testOptions.evals);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-fastFactoredCandidateMultiplier") && (i + 1 < args.length)) {
        testOptions.fastFactoredCandidateMultiplier = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-fastFactoredCandidateAddend") && (i + 1 < args.length)) {
        testOptions.fastFactoredCandidateAddend = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-quietEvaluation")) {
        testOptions.quietEvaluation = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noquietEvaluation")) {
        testOptions.quietEvaluation = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-simpleBinarizedLabels")) {
        trainOptions.simpleBinarizedLabels = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noRebinarization")) {
        trainOptions.noRebinarization = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-dvKBest") && (i + 1 < args.length)) {
        // also overrides rerankerKBest with the same value
        trainOptions.dvKBest = Integer.parseInt(args[i + 1]);
        rerankerKBest = trainOptions.dvKBest;
        i += 2;
    } else if (args[i].equalsIgnoreCase("-regCost") && (i + 1 < args.length)) {
        trainOptions.regCost = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if ((args[i].equalsIgnoreCase("-dvIterations") || args[i].equalsIgnoreCase("-trainingIterations")) && (i + 1 < args.length)) {
        trainOptions.trainingIterations = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-stalledIterationLimit") && (i + 1 < args.length)) {
        trainOptions.stalledIterationLimit = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if ((args[i].equalsIgnoreCase("-dvBatchSize") || args[i].equalsIgnoreCase("-batchSize")) && (i + 1 < args.length)) {
        trainOptions.batchSize = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-qnIterationsPerBatch") && (i + 1 < args.length)) {
        trainOptions.qnIterationsPerBatch = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-qnEstimates") && (i + 1 < args.length)) {
        trainOptions.qnEstimates = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-qnTolerance") && (i + 1 < args.length)) {
        trainOptions.qnTolerance = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-debugOutputFrequency") && (i + 1 < args.length)) {
        trainOptions.debugOutputFrequency = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-maxTrainTimeSeconds") && (i + 1 < args.length)) {
        trainOptions.maxTrainTimeSeconds = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if ((args[i].equalsIgnoreCase("-dvSeed") || args[i].equalsIgnoreCase("-randomSeed")) && (i + 1 < args.length)) {
        trainOptions.randomSeed = Long.parseLong(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-wordVectorFile") && (i + 1 < args.length)) {
        lexOptions.wordVectorFile = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-numHid") && (i + 1 < args.length)) {
        lexOptions.numHid = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-learningRate") && (i + 1 < args.length)) {
        trainOptions.learningRate = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-deltaMargin") && (i + 1 < args.length)) {
        trainOptions.deltaMargin = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unknownNumberVector")) {
        trainOptions.unknownNumberVector = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownNumberVector")) {
        trainOptions.unknownNumberVector = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-unknownDashedWordVectors")) {
        trainOptions.unknownDashedWordVectors = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownDashedWordVectors")) {
        trainOptions.unknownDashedWordVectors = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-unknownCapsVector")) {
        trainOptions.unknownCapsVector = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownCapsVector")) {
        trainOptions.unknownCapsVector = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-unknownChineseYearVector")) {
        trainOptions.unknownChineseYearVector = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownChineseYearVector")) {
        trainOptions.unknownChineseYearVector = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-unknownChineseNumberVector")) {
        trainOptions.unknownChineseNumberVector = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownChineseNumberVector")) {
        trainOptions.unknownChineseNumberVector = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-unknownChinesePercentVector")) {
        trainOptions.unknownChinesePercentVector = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUnknownChinesePercentVector")) {
        trainOptions.unknownChinesePercentVector = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-dvSimplifiedModel")) {
        trainOptions.dvSimplifiedModel = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-scalingForInit") && (i + 1 < args.length)) {
        trainOptions.scalingForInit = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-rerankerKBest") && (i + 1 < args.length)) {
        rerankerKBest = Integer.parseInt(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-baseParserWeight") && (i + 1 < args.length)) {
        baseParserWeight = Double.parseDouble(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-unkWord") && (i + 1 < args.length)) {
        trainOptions.unkWord = args[i + 1];
        i += 2;
    } else if (args[i].equalsIgnoreCase("-lowercaseWordVectors")) {
        trainOptions.lowercaseWordVectors = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noLowercaseWordVectors")) {
        trainOptions.lowercaseWordVectors = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-transformMatrixType") && (i + 1 < args.length)) {
        // must match an enum constant name exactly; valueOf throws IllegalArgumentException otherwise
        trainOptions.transformMatrixType = TrainOptions.TransformMatrixType.valueOf(args[i + 1]);
        i += 2;
    } else if (args[i].equalsIgnoreCase("-useContextWords")) {
        trainOptions.useContextWords = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noUseContextWords")) {
        trainOptions.useContextWords = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-trainWordVectors")) {
        trainOptions.trainWordVectors = true;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-noTrainWordVectors")) {
        trainOptions.trainWordVectors = false;
        i += 1;
    } else if (args[i].equalsIgnoreCase("-markStrahler")) {
        trainOptions.markStrahler = true;
        i += 1;
    }
    // unchanged i tells the caller that nothing was recognized at this position
    return i;
}
Also used : CompositeTreeTransformer(edu.stanford.nlp.trees.CompositeTreeTransformer) TreeTransformer(edu.stanford.nlp.trees.TreeTransformer) CompositeTreeTransformer(edu.stanford.nlp.trees.CompositeTreeTransformer)

Example 2 with CompositeTreeTransformer

use of edu.stanford.nlp.trees.CompositeTreeTransformer in project CoreNLP by stanfordnlp.

the class ShiftReduceParser method binarizeTreebank.

/**
 * Transforms every tree of a treebank and returns the results as a list.
 * <p>
 * The treebank is first run through a {@link CompositeTreeTransformer} to which a
 * simple {@link TreeBinarizer} and then a {@link BasicCategoryTreeTransformer}
 * have been added.  Each resulting tree is converted to CoreLabels, has head
 * annotations percolated using a {@link BinaryHeadFinder}, and has its leaves
 * indexed starting from 1.
 *
 * @param treebank the trees to transform
 * @param op supplies the head finder and language pack used by the transformers
 * @return the processed trees, in the iteration order of the transformed treebank
 */
public static List<Tree> binarizeTreebank(Treebank treebank, Options op) {
    TreeBinarizer treeBinarizer = TreeBinarizer.simpleTreeBinarizer(op.tlpParams.headFinder(), op.tlpParams.treebankLanguagePack());
    BasicCategoryTreeTransformer categoryTransformer = new BasicCategoryTreeTransformer(op.langpack());

    // Registration order matters: binarizer first, category transformer second.
    CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
    pipeline.addTransformer(treeBinarizer);
    pipeline.addTransformer(categoryTransformer);
    Treebank transformed = treebank.transform(pipeline);

    HeadFinder headFinder = new BinaryHeadFinder(op.tlpParams.headFinder());
    List<Tree> result = Generics.newArrayList();
    for (Tree current : transformed) {
        Trees.convertToCoreLabels(current);
        current.percolateHeadAnnotations(headFinder);
        // Leaves are numbered from 1 because downstream tools expect
        // 1-based indices; code internal to the srparser has to
        // renormalize them itself.
        current.indexLeaves(1, true);
        result.add(current);
    }
    return result;
}
Also used : BasicCategoryTreeTransformer(edu.stanford.nlp.trees.BasicCategoryTreeTransformer) BinaryHeadFinder(edu.stanford.nlp.parser.lexparser.BinaryHeadFinder) TreeBinarizer(edu.stanford.nlp.parser.lexparser.TreeBinarizer) HeadFinder(edu.stanford.nlp.trees.HeadFinder) BinaryHeadFinder(edu.stanford.nlp.parser.lexparser.BinaryHeadFinder) Tree(edu.stanford.nlp.trees.Tree) CompositeTreeTransformer(edu.stanford.nlp.trees.CompositeTreeTransformer)

Aggregations

CompositeTreeTransformer (edu.stanford.nlp.trees.CompositeTreeTransformer): 2, BinaryHeadFinder (edu.stanford.nlp.parser.lexparser.BinaryHeadFinder): 1, TreeBinarizer (edu.stanford.nlp.parser.lexparser.TreeBinarizer): 1, BasicCategoryTreeTransformer (edu.stanford.nlp.trees.BasicCategoryTreeTransformer): 1, HeadFinder (edu.stanford.nlp.trees.HeadFinder): 1, Tree (edu.stanford.nlp.trees.Tree): 1, TreeTransformer (edu.stanford.nlp.trees.TreeTransformer): 1