Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.WordEmbeddings.NormalizationMethod in project cogcomp-nlp by CogComp.
Example: the readAndLoadConfig method of the Parameters class.
/**
* This is the method that does all the work. This populates and returns a
* {@link ParametersForLbjCode} object, which is then used throughout the codebase.
*
* @param rm a populated <code>ResourceManager</code>.
* @param areWeTraining whether this run will train a model. If we are training, we make sure
*        a folder exists in which to put the trained model; if not, we make sure the model
*        files exist.
* @return a {@link ParametersForLbjCode} object populated according to the
* <code>ResourceManager</code> argument.
*/
public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean areWeTraining) {
ParametersForLbjCode param = new ParametersForLbjCode();
try {
// First check for any empty answers (NOT allowed):
Enumeration<Object> enumeration = rm.getKeys();
while (enumeration.hasMoreElements()) {
String el = (String) enumeration.nextElement();
if (rm.getString(el).isEmpty()) {
throw new IllegalArgumentException("Config File Error: parameter " + el + " has no value. Either comment it out (with #), or remove it.");
}
}
param.debug = rm.getDebug();
// ParametersForLbjCode.currentParameters.debug = param.debug;
double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL);
double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE);
// Required params
String cFilename = rm.getString(NerBaseConfigurator.MODEL_NAME);
if (cFilename == null) {
throw new IllegalArgumentException("Config File Error: Expected value for non-optional '" + NerBaseConfigurator.MODEL_NAME + "'");
}
param.configFilename = cFilename;
if (rm.containsKey("language")) {
Language lang = Language.getLanguageByCode(rm.getString("language"));
param.language = lang;
// because it is used in initializing tree gazetteers
ParametersForLbjCode.currentParameters.language = lang;
}
if (rm.containsKey("labelsToAnonymizeInEvaluation")) {
String labelsToAnonymizeInEvaluation = rm.getString("labelsToAnonymizeInEvaluation");
param.labelsToAnonymizeInEvaluation = new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" ")));
}
if (rm.containsKey("labelsToIgnoreInEvaluation")) {
String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation");
param.labelsToIgnoreInEvaluation = new Vector<>(Arrays.asList(labelsToIgnoreInEvaluation.split(" ")));
}
if (rm.getString("pathToModelFile") == null) {
throw new IllegalArgumentException("Config File Error: Expected value for non-optional 'pathToModelFile'");
}
param.pathToModelFile = rm.getString("pathToModelFile") + "/" + param.configFilename + ".model";
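// Check whether the level 1 and level 2 model and lexicon files exist, either on disk or on the classpath.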
String modelFile1 = param.pathToModelFile + ".level1";
File fullModelFile1 = new File(modelFile1);
boolean file1Exists = fullModelFile1.exists() || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1);
String modelFile1Lex = param.pathToModelFile + ".level1.lex";
File fullModelFile1Lex = new File(modelFile1Lex);
boolean file1LexExists = fullModelFile1Lex.exists() || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1Lex);
String modelFile2 = param.pathToModelFile + ".level2";
File fullModelFile2 = new File(modelFile2);
boolean file2Exists = fullModelFile2.exists() || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2);
String modelFile2Lex = param.pathToModelFile + ".level2.lex";
File fullModelFile2Lex = new File(modelFile2Lex);
boolean file2LexExists = fullModelFile2Lex.exists() || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2Lex);
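// The level 2 files are only required when PredictionsLevel1 is set to 1; missing files are fatal unless we are training.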
if (!file1Exists || !file1LexExists || (rm.containsKey("PredictionsLevel1") && rm.getString("PredictionsLevel1").equals("1") && (!file2Exists || !file2LexExists))) {
// if we are not training
if (!areWeTraining) {
throw new IllegalArgumentException("Config File Error: one of " + param.pathToModelFile + ".level{1,2}[.lex] does not exist.");
} else {
// if we are training, we need to have the train directory
File trainDir = new File(rm.getString("pathToModelFile"));
if (!trainDir.isDirectory())
trainDir.mkdirs();
}
}
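// taggingEncodingScheme selects the chunk encoding used for tagging (e.g. BIO or BILOU).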
String taggingEncodingSchemeString = rm.getString("taggingEncodingScheme");
if (taggingEncodingSchemeString == null) {
throw new IllegalArgumentException("Config File Error: Expected value for non-optional 'taggingEncodingScheme'");
}
param.taggingEncodingScheme = TextChunkRepresentationManager.EncodingScheme.valueOf(taggingEncodingSchemeString);
// Optional params
if (rm.containsKey("auxiliaryModels")) {
String auxListString = rm.getString("auxiliaryModels");
// should be a list
String[] auxModels = auxListString.split("\\t");
// FIXME: add func so that if auxModels.length is odd, then we have a problem...
for (int i = 0; i < auxModels.length; i += 2) {
ResourceManager auxRm = new ResourceManager(auxModels[i]);
// loading auxiliary models, never training
ParametersForLbjCode aux = readAndLoadConfig(auxRm, false);
aux.nameAsAuxFeature = auxModels[i + 1];
loadClassifierModels(aux);
param.auxiliaryModels.addElement(aux);
}
}
if (rm.containsKey("normalizeTitleText")) {
param.normalizeTitleText = Boolean.parseBoolean(rm.getString("normalizeTitleText"));
}
if (rm.containsKey("pathToTokenNormalizationData")) {
param.pathToTokenNormalizationData = rm.getString("pathToTokenNormalizationData");
TitleTextNormalizer.pathToBrownClusterForWordFrequencies = param.pathToTokenNormalizationData;
}
if (rm.containsKey("forceNewSentenceOnLineBreaks")) {
param.forceNewSentenceOnLineBreaks = Boolean.parseBoolean(rm.getString("forceNewSentenceOnLineBreaks"));
}
if (rm.containsKey("sortLexicallyFilesInFolders")) {
param.sortLexicallyFilesInFolders = Boolean.parseBoolean(rm.getString("sortLexicallyFilesInFolders"));
}
if (rm.containsKey("treatAllFilesInFolderAsOneBigDocument")) {
param.treatAllFilesInFolderAsOneBigDocument = Boolean.parseBoolean(rm.getString("treatAllFilesInFolderAsOneBigDocument"));
}
if (rm.containsKey("minConfidencePredictionsLevel1")) {
param.minConfidencePredictionsLevel1 = Double.parseDouble(rm.getString("minConfidencePredictionsLevel1"));
}
if (rm.containsKey("minConfidencePredictionsLevel2")) {
param.minConfidencePredictionsLevel2 = Double.parseDouble(rm.getString("minConfidencePredictionsLevel2"));
}
if (rm.containsKey("learningRatePredictionsLevel1")) {
param.learningRatePredictionsLevel1 = Double.parseDouble(rm.getString("learningRatePredictionsLevel1"));
}
if (rm.containsKey("learningRatePredictionsLevel2")) {
param.learningRatePredictionsLevel2 = Double.parseDouble(rm.getString("learningRatePredictionsLevel2"));
}
if (rm.containsKey("thicknessPredictionsLevel1")) {
param.thicknessPredictionsLevel1 = Integer.parseInt(rm.getString("thicknessPredictionsLevel1"));
}
if (rm.containsKey("thicknessPredictionsLevel2")) {
param.thicknessPredictionsLevel2 = Integer.parseInt(rm.getString("thicknessPredictionsLevel2"));
}
// labelTypes is just a String[]
if (rm.containsKey("labelTypes")) {
// split on whitespace
param.labelTypes = rm.getString("labelTypes").split("\\s+");
}
// Inclusion of all the features
param.featuresToUse = new HashMap<>();
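// possibleFeatures is a static array of recognized feature names defined in this class.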
for (String feature : possibleFeatures) {
if (rm.containsKey(feature) && rm.getString(feature).equals("1")) {
logger.debug("Adding feature: {}", feature);
param.featuresToUse.put(feature, true);
}
}
// Default positive features
param.featuresToUse.put("TitleNormalization", true);
param.featuresToUse.put("WordTopicTitleInfo", true);
// GazetteersFeatures
if (rm.containsKey("GazetteersFeatures") && rm.getString("GazetteersFeatures").equals("1")) {
String pathToGazetteersLists = rm.getString("pathToGazetteersLists");
if (rm.containsKey("FlatGazetteers") && Boolean.parseBoolean(rm.getString("FlatGazetteers"))) {
logger.info("Loading FlatGazetteers");
GazetteersFactory.init(5, pathToGazetteersLists, true);
} else {
int maxPhraseLength = 5;
if (rm.containsKey(NerBaseConfigurator.PHRASE_LENGTH))
maxPhraseLength = rm.getInt(NerBaseConfigurator.PHRASE_LENGTH);
GazetteersFactory.init(maxPhraseLength, pathToGazetteersLists, false);
}
}
// WordEmbeddings feature
String wordEmbeddingDebug = "";
if (rm.containsKey("WordEmbeddings") && rm.getString("WordEmbeddings").equals("1")) {
Vector<String> pathsToWordEmbeddings = getStringVector(rm.getString("pathsToWordEmbeddings").split("\\s+")); // list
Vector<Integer> dimensionality = getIntegerVector(rm.getString("embeddingDimensionalities").split("\\s+")); // list
// note: the key to look for is minWordAppThresholdsForEmbeddings
Vector<Integer> wordAppThresEmbeddings = getIntegerVector(rm.getString("minWordAppThresholdsForEmbeddings").split("\\s+")); // list
// FIXME: check all others for things like this
Vector<Boolean> isLowercaseWordEmbeddings = getBooleanVector(rm.getString("isLowercaseWordEmbeddings").split("\\s+")); // list
Vector<Double> normalizationConstantsForEmbeddings = getDoubleVector(rm.getString("normalizationConstantsForEmbeddings").split("\\s+")); // list
Vector<NormalizationMethod> normalizationMethodsForEmbeddings = getNMVector(rm.getString("normalizationMethodsForEmbeddings").split("\\s+")); // list
// Check that all vectors are the same length
int standard = pathsToWordEmbeddings.size();
if (dimensionality.size() != standard || wordAppThresEmbeddings.size() != standard || isLowercaseWordEmbeddings.size() != standard || normalizationConstantsForEmbeddings.size() != standard || normalizationMethodsForEmbeddings.size() != standard) {
throw new IllegalArgumentException("Config file error: all resources for WordEmbeddings " + "(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, " + "isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, " + "normalizationMethodsForEmbeddings) need to have the same number of parameters.");
}
WordEmbeddings.init(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, normalizationMethodsForEmbeddings);
for (int i = 0; i < pathsToWordEmbeddings.size(); i++) {
wordEmbeddingDebug += "Words Embeddings resource: \n";
wordEmbeddingDebug += "\t-Path: " + pathsToWordEmbeddings.elementAt(i) + "\n";
wordEmbeddingDebug += "\t-Dimensionality=" + dimensionality.elementAt(i) + "\n";
wordEmbeddingDebug += "\t-WordThres=" + wordAppThresEmbeddings.elementAt(i) + "\n";
wordEmbeddingDebug += "\t-IsLowercased=" + isLowercaseWordEmbeddings.elementAt(i) + "\n";
}
}
// BrownClusterPaths feature
String brownDebug = "";
if (rm.containsKey("BrownClusterPaths") && rm.getString("BrownClusterPaths").equals("1")) {
Vector<String> pathsToBrownClusters = getStringVector(rm.getString("pathsToBrownClusters").split("\\s+")); // list
Vector<Integer> minWordAppThresholdsForBrownClusters = getIntegerVector(rm.getString("minWordAppThresholdsForBrownClusters").split("\\s+")); // list
Vector<Boolean> lowercaseBrown = getBooleanVector(rm.getString("isLowercaseBrownClusters").split("\\s+")); // list
// Check that vectors are all the same length
int standard = pathsToBrownClusters.size();
if (minWordAppThresholdsForBrownClusters.size() != standard || lowercaseBrown.size() != standard) {
throw new IllegalArgumentException("Config file error: all resources for BrownClusters " + "(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, " + "isLowercaseBrownClusters) need to have the same number of parameters.");
}
BrownClusters.init(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, lowercaseBrown);
// For output later
for (int i = 0; i < pathsToBrownClusters.size(); i++) {
brownDebug += "Brown clusters resource: \n";
brownDebug += "\t-Path: " + pathsToBrownClusters.elementAt(i) + "\n";
brownDebug += "\t-WordThres=" + minWordAppThresholdsForBrownClusters.elementAt(i) + "\n";
brownDebug += "\t-IsLowercased=" + lowercaseBrown.elementAt(i) + "\n";
}
}
param.randomNoiseLevel = randomNoiseLevel;
param.omissionRate = omissionRate;
// don't forget that these should be initialized only after we know the target labels
// and the encoding scheme
param.patternLabelRandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
param.level1AggregationRandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
param.prevPredictionsLevel1RandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
param.prevPredictionsLevel2RandomGenerator = new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
} catch (IOException e) {
e.printStackTrace();
}
return param;
}
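For context, a minimal usage sketch follows, assuming a properties-style config file at a hypothetical path; the keys it must contain are those read above (pathToModelFile, taggingEncodingScheme, and so on).

// Minimal usage sketch. The config path below is hypothetical.
// ResourceManager(String) throws IOException, which the caller must handle.
ResourceManager rm = new ResourceManager("config/ner.properties");
// areWeTraining = false: readAndLoadConfig verifies that trained model files
// already exist, rather than creating a directory for new ones.
ParametersForLbjCode params = Parameters.readAndLoadConfig(rm, false);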