Search in sources :

Example 1 with NormalizationMethod

use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.WordEmbeddings.NormalizationMethod in project cogcomp-nlp by CogComp.

From the class Parameters, in the method readAndLoadConfig.

/**
 * This is the method that does all the work. This populates and returns a
 * {@link ParametersForLbjCode} object, which is then used throughout the codebase.
 *
 * @param rm a populated <code>ResourceManager</code>.
 * @param areWeTraining this value determines whether or not this run will involve training a
 *        model. If we are training, then we make sure there exists a folder in which to put the
 *        trained model. If not, then we make sure the model exists.
 * @return a {@link ParametersForLbjCode} object populated according to the
 *         <code>ResourceManager</code> argument.
 * @throws IllegalArgumentException if a required parameter is missing, a parameter has an
 *         empty value, parallel list-valued parameters have mismatched lengths, or (when not
 *         training) a required model file cannot be found.
 */
public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean areWeTraining) {
    ParametersForLbjCode param = new ParametersForLbjCode();
    try {
        // First check for any empty answers (NOT allowed):
        Enumeration<Object> enumeration = rm.getKeys();
        while (enumeration.hasMoreElements()) {
            String el = (String) enumeration.nextElement();
            if (rm.getString(el).isEmpty()) {
                throw new IllegalArgumentException("Config File Error: parameter " + el + " has no value. Either comment it out (with #), or remove it.");
            }
        }
        param.debug = rm.getDebug();
        double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL);
        double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE);
        // Required params
        String cFilename = rm.getString(NerBaseConfigurator.MODEL_NAME);
        if (cFilename == null) {
            throw new IllegalArgumentException("Config File Error: Expected value for non-optional '" + NerBaseConfigurator.MODEL_NAME + "'");
        }
        param.configFilename = cFilename;
        if (rm.containsKey("language")) {
            Language lang = Language.getLanguageByCode(rm.getString("language"));
            param.language = lang;
            // set on the static current parameters too, because it is used in
            // initializing tree gazetteers
            ParametersForLbjCode.currentParameters.language = lang;
        }
        if (rm.containsKey("labelsToAnonymizeInEvaluation")) {
            String labelsToAnonymizeInEvaluation = rm.getString("labelsToAnonymizeInEvaluation");
            param.labelsToAnonymizeInEvaluation = new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" ")));
        }
        if (rm.containsKey("labelsToIgnoreInEvaluation")) {
            String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation");
            param.labelsToIgnoreInEvaluation = new Vector<>(Arrays.asList(labelsToIgnoreInEvaluation.split(" ")));
        }
        if (rm.getString("pathToModelFile") == null) {
            throw new IllegalArgumentException("Config File Error: Expected value for non-optional 'pathToModelFile'");
        }
        param.pathToModelFile = rm.getString("pathToModelFile") + "/" + param.configFilename + ".model";
        // The four artifacts of a trained model: level-1/level-2 classifiers plus their
        // lexicons. Each may live on disk or on the classpath.
        String modelFile1 = param.pathToModelFile + ".level1";
        File fullModelFile1 = new File(modelFile1);
        boolean file1Exists = fullModelFile1.exists() || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1);
        String modelFile1Lex = param.pathToModelFile + ".level1.lex";
        File fullModelFile1Lex = new File(modelFile1Lex);
        boolean file1LexExists = fullModelFile1Lex.exists() || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1Lex);
        String modelFile2 = param.pathToModelFile + ".level2";
        File fullModelFile2 = new File(modelFile2);
        boolean file2Exists = fullModelFile2.exists() || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2);
        String modelFile2Lex = param.pathToModelFile + ".level2.lex";
        File fullModelFile2Lex = new File(modelFile2Lex);
        boolean file2LexExists = fullModelFile2Lex.exists() || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2Lex);
        // Level-2 artifacts are only required when level-1 predictions feed level 2.
        if (!file1Exists || !file1LexExists || (rm.containsKey("PredictionsLevel1") && rm.getString("PredictionsLevel1").equals("1") && (!file2Exists || !file2LexExists))) {
            // if we are not training, a missing model is fatal
            if (!areWeTraining) {
                throw new IllegalArgumentException("Config File Error: one of " + param.pathToModelFile + ".level{1,2}[.lex] does not exist.");
            } else {
                // if we are training, we need to have the train directory
                File trainDir = new File(rm.getString("pathToModelFile"));
                if (!trainDir.isDirectory())
                    trainDir.mkdirs();
            }
        }
        String taggingEncodingSchemeString = rm.getString("taggingEncodingScheme");
        if (taggingEncodingSchemeString == null) {
            throw new IllegalArgumentException("Config File Error: Expected value for non-optional 'taggingEncodingScheme'");
        }
        param.taggingEncodingScheme = TextChunkRepresentationManager.EncodingScheme.valueOf(taggingEncodingSchemeString);
        // Optional params
        if (rm.containsKey("auxiliaryModels")) {
            String auxListString = rm.getString("auxiliaryModels");
            // tab-separated flat list of <config path, feature name> pairs
            String[] auxModels = auxListString.split("\\t");
            // An odd number of entries means a pair is incomplete; fail with a clear
            // message instead of an ArrayIndexOutOfBoundsException below.
            if (auxModels.length % 2 != 0) {
                throw new IllegalArgumentException("Config File Error: 'auxiliaryModels' must contain pairs of <config path, feature name> separated by tabs, but " + auxModels.length + " entries were found.");
            }
            for (int i = 0; i < auxModels.length; i += 2) {
                ResourceManager auxRm = new ResourceManager(auxModels[i]);
                // loading auxiliary models, never training them
                ParametersForLbjCode aux = readAndLoadConfig(auxRm, false);
                aux.nameAsAuxFeature = auxModels[i + 1];
                loadClassifierModels(aux);
                param.auxiliaryModels.addElement(aux);
            }
        }
        if (rm.containsKey("normalizeTitleText")) {
            param.normalizeTitleText = Boolean.parseBoolean(rm.getString("normalizeTitleText"));
        }
        if (rm.containsKey("pathToTokenNormalizationData")) {
            param.pathToTokenNormalizationData = rm.getString("pathToTokenNormalizationData");
            TitleTextNormalizer.pathToBrownClusterForWordFrequencies = param.pathToTokenNormalizationData;
        }
        if (rm.containsKey("forceNewSentenceOnLineBreaks")) {
            param.forceNewSentenceOnLineBreaks = Boolean.parseBoolean(rm.getString("forceNewSentenceOnLineBreaks"));
        }
        if (rm.containsKey("sortLexicallyFilesInFolders")) {
            param.sortLexicallyFilesInFolders = Boolean.parseBoolean(rm.getString("sortLexicallyFilesInFolders"));
        }
        if (rm.containsKey("treatAllFilesInFolderAsOneBigDocument")) {
            param.treatAllFilesInFolderAsOneBigDocument = Boolean.parseBoolean(rm.getString("treatAllFilesInFolderAsOneBigDocument"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel1")) {
            param.minConfidencePredictionsLevel1 = Double.parseDouble(rm.getString("minConfidencePredictionsLevel1"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel2")) {
            param.minConfidencePredictionsLevel2 = Double.parseDouble(rm.getString("minConfidencePredictionsLevel2"));
        }
        if (rm.containsKey("learningRatePredictionsLevel1")) {
            param.learningRatePredictionsLevel1 = Double.parseDouble(rm.getString("learningRatePredictionsLevel1"));
        }
        if (rm.containsKey("learningRatePredictionsLevel2")) {
            param.learningRatePredictionsLevel2 = Double.parseDouble(rm.getString("learningRatePredictionsLevel2"));
        }
        if (rm.containsKey("thicknessPredictionsLevel1")) {
            param.thicknessPredictionsLevel1 = Integer.parseInt(rm.getString("thicknessPredictionsLevel1"));
        }
        if (rm.containsKey("thicknessPredictionsLevel2")) {
            param.thicknessPredictionsLevel2 = Integer.parseInt(rm.getString("thicknessPredictionsLevel2"));
        }
        // labelTypes is just a String[], split on whitespace
        if (rm.containsKey("labelTypes")) {
            param.labelTypes = rm.getString("labelTypes").split("\\s+");
        }
        // Inclusion of all the features: any known feature flag set to "1" is enabled.
        param.featuresToUse = new HashMap<>();
        for (String feature : possibleFeatures) {
            if (rm.containsKey(feature) && rm.getString(feature).equals("1")) {
                logger.debug("Adding feature: {}", feature);
                param.featuresToUse.put(feature, true);
            }
        }
        // Default positive features (always on)
        param.featuresToUse.put("TitleNormalization", true);
        param.featuresToUse.put("WordTopicTitleInfo", true);
        // GazetteersFeatures
        if (rm.containsKey("GazetteersFeatures") && rm.getString("GazetteersFeatures").equals("1")) {
            String pathToGazetteersLists = rm.getString("pathToGazetteersLists");
            if (rm.containsKey("FlatGazetteers") && Boolean.parseBoolean(rm.getString("FlatGazetteers"))) {
                logger.info("Loading FlatGazetteers");
                GazetteersFactory.init(5, pathToGazetteersLists, true);
            } else {
                int maxPhraseLength = 5;
                if (rm.containsKey(NerBaseConfigurator.PHRASE_LENGTH))
                    maxPhraseLength = rm.getInt(NerBaseConfigurator.PHRASE_LENGTH);
                GazetteersFactory.init(maxPhraseLength, pathToGazetteersLists, false);
            }
        }
        // WordEmbeddings feature: six parallel whitespace-separated lists, one entry per
        // embedding resource.
        if (rm.containsKey("WordEmbeddings") && rm.getString("WordEmbeddings").equals("1")) {
            Vector<String> pathsToWordEmbeddings = getStringVector(rm.getString("pathsToWordEmbeddings").split("\\s+"));
            Vector<Integer> dimensionality = getIntegerVector(rm.getString("embeddingDimensionalities").split("\\s+"));
            Vector<Integer> wordAppThresEmbeddings = getIntegerVector(rm.getString("minWordAppThresholdsForEmbeddings").split("\\s+"));
            Vector<Boolean> isLowercaseWordEmbeddings = getBooleanVector(rm.getString("isLowercaseWordEmbeddings").split("\\s+"));
            Vector<Double> normalizationConstantsForEmbeddings = getDoubleVector(rm.getString("normalizationConstantsForEmbeddings").split("\\s+"));
            Vector<NormalizationMethod> normalizationMethodsForEmbeddings = getNMVector(rm.getString("normalizationMethodsForEmbeddings").split("\\s+"));
            // Check that all vectors are the same length
            int standard = pathsToWordEmbeddings.size();
            if (dimensionality.size() != standard || wordAppThresEmbeddings.size() != standard || isLowercaseWordEmbeddings.size() != standard || normalizationConstantsForEmbeddings.size() != standard || normalizationMethodsForEmbeddings.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for WordEmbeddings " + "(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, " + "isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, " + "normalizationMethodsForEmbeddings) need to have the same number of parameters.");
            }
            WordEmbeddings.init(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, normalizationMethodsForEmbeddings);
            // Summarize the loaded resources for debugging.
            StringBuilder wordEmbeddingDebug = new StringBuilder();
            for (int i = 0; i < pathsToWordEmbeddings.size(); i++) {
                wordEmbeddingDebug.append("Words Embeddings resource: \n");
                wordEmbeddingDebug.append("\t-Path: ").append(pathsToWordEmbeddings.elementAt(i)).append("\n");
                wordEmbeddingDebug.append("\t-Dimensionality=").append(dimensionality.elementAt(i)).append("\n");
                wordEmbeddingDebug.append("\t-WordThres=").append(wordAppThresEmbeddings.elementAt(i)).append("\n");
                wordEmbeddingDebug.append("\t-IsLowercased=").append(isLowercaseWordEmbeddings.elementAt(i)).append("\n");
            }
            logger.debug("{}", wordEmbeddingDebug);
        }
        // BrownClusterPaths feature: three parallel whitespace-separated lists.
        if (rm.containsKey("BrownClusterPaths") && rm.getString("BrownClusterPaths").equals("1")) {
            Vector<String> pathsToBrownClusters = getStringVector(rm.getString("pathsToBrownClusters").split("\\s+"));
            Vector<Integer> minWordAppThresholdsForBrownClusters = getIntegerVector(rm.getString("minWordAppThresholdsForBrownClusters").split("\\s+"));
            Vector<Boolean> lowercaseBrown = getBooleanVector(rm.getString("isLowercaseBrownClusters").split("\\s+"));
            // Check that vectors are all the same length
            int standard = pathsToBrownClusters.size();
            if (minWordAppThresholdsForBrownClusters.size() != standard || lowercaseBrown.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for BrownClusters " + "(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, " + "isLowercaseBrownClusters) need to have the same number of parameters.");
            }
            BrownClusters.init(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, lowercaseBrown);
            // Summarize the loaded resources for debugging.
            StringBuilder brownDebug = new StringBuilder();
            for (int i = 0; i < pathsToBrownClusters.size(); i++) {
                brownDebug.append("Brown clusters resource: \n");
                brownDebug.append("\t-Path: ").append(pathsToBrownClusters.elementAt(i)).append("\n");
                brownDebug.append("\t-WordThres=").append(minWordAppThresholdsForBrownClusters.elementAt(i)).append("\n");
                brownDebug.append("\t-IsLowercased=").append(lowercaseBrown.elementAt(i)).append("\n");
            }
            logger.debug("{}", brownDebug);
        }
        param.randomNoiseLevel = randomNoiseLevel;
        param.omissionRate = omissionRate;
        // don't forget that these should be initialized only after we know the target labels
        // and the encoding scheme
        param.patternLabelRandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.level1AggregationRandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel1RandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel2RandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
    } catch (IOException e) {
        // Preserve historical best-effort behavior (return the partially populated param),
        // but report through the logger instead of printStackTrace.
        logger.error("IOException while reading NER configuration", e);
    }
    return param;
}
Also used : Language(edu.illinois.cs.cogcomp.core.constants.Language) NETaggerLevel2(edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel2) NETaggerLevel1(edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel1) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) IOException(java.io.IOException) File(java.io.File) OutFile(edu.illinois.cs.cogcomp.ner.IO.OutFile) NormalizationMethod(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.WordEmbeddings.NormalizationMethod)

Aggregations

Language (edu.illinois.cs.cogcomp.core.constants.Language)1 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)1 NormalizationMethod (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.WordEmbeddings.NormalizationMethod)1 OutFile (edu.illinois.cs.cogcomp.ner.IO.OutFile)1 NETaggerLevel1 (edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel1)1 NETaggerLevel2 (edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel2)1 File (java.io.File)1 IOException (java.io.IOException)1