Use of edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager in project cogcomp-nlp by CogComp.
From the class SimpleGazetteerAnnotatorTest, method setUpBeforeClass:
/**
 * @throws java.lang.Exception
 */
@BeforeClass
public static void setUpBeforeClass() throws Exception {
    Properties props = new Properties();
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.IS_LAZILY_INITIALIZED.key, Configurator.FALSE);
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.PATH_TO_DICTIONARIES.key, "/testgazetteers/");
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.PHRASE_LENGTH.key, "6");
    defaultRm = new ResourceManager(props);
}
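Once defaultRm is built, the test class can hand it to the annotator under test. A minimal sketch of such a follow-up test, assuming a SimpleGazetteerAnnotator(ResourceManager) constructor and the inherited Annotator.isInitialized() accessor:

@Test
public void testEagerInitialization() throws Exception {
    // Hypothetical test: IS_LAZILY_INITIALIZED was set to FALSE above, so the
    // dictionaries under /testgazetteers/ should be loaded at construction time.
    SimpleGazetteerAnnotator sga = new SimpleGazetteerAnnotator(defaultRm);
    assertTrue(sga.isInitialized());
}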
Use of edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager in project cogcomp-nlp by CogComp.
From the class Parameters, method readAndLoadConfig:
/**
* This is the method that does all the work. This populates and returns a
* {@link ParametersForLbjCode} object, which is then used throughout the codebase.
*
* @param rm a populated <code>ResourceManager</code>.
* @param areWeTraining this value determines whether or not this run will involve training a
* model. If we are training, then we make sure there exists a folder in which to put the
* trained model. If not, then we make sure the model exists.
* @return a {@link ParametersForLbjCode} object populated according to the
* <code>ResourceManager</code> argument.
*/
public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean areWeTraining) {
    ParametersForLbjCode param = new ParametersForLbjCode();
    try {
        // First check for any empty answers (NOT allowed):
        Enumeration<Object> enumeration = rm.getKeys();
        while (enumeration.hasMoreElements()) {
            String el = (String) enumeration.nextElement();
            if (rm.getString(el).isEmpty()) {
                throw new IllegalArgumentException("Config File Error: parameter " + el
                        + " has no value. Either comment it out (with #), or remove it.");
            }
        }
        param.debug = rm.getDebug();
        // ParametersForLbjCode.currentParameters.debug = param.debug;
        double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL);
        double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE);

        // Required params
        String cFilename = rm.getString(NerBaseConfigurator.MODEL_NAME);
        if (cFilename == null) {
            throw new IllegalArgumentException("Config File Error: Expected value for non-optional '"
                    + NerBaseConfigurator.MODEL_NAME + "'");
        }
        param.configFilename = cFilename;
        if (rm.containsKey("language")) {
            Language lang = Language.getLanguageByCode(rm.getString("language"));
            param.language = lang;
            // because it is used in initializing tree gazetteers
            ParametersForLbjCode.currentParameters.language = lang;
        }
        if (rm.containsKey("labelsToAnonymizeInEvaluation")) {
            String labelsToAnonymizeInEvaluation = rm.getString("labelsToAnonymizeInEvaluation");
            param.labelsToAnonymizeInEvaluation =
                    new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" ")));
        }
        if (rm.containsKey("labelsToIgnoreInEvaluation")) {
            String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation");
            param.labelsToIgnoreInEvaluation =
                    new Vector<>(Arrays.asList(labelsToIgnoreInEvaluation.split(" ")));
        }
        if (rm.getString("pathToModelFile") == null) {
            throw new IllegalArgumentException(
                    "Config File Error: Expected value for non-optional 'pathToModelFile'");
        }
        param.pathToModelFile = rm.getString("pathToModelFile") + "/" + param.configFilename + ".model";
        String modelFile1 = param.pathToModelFile + ".level1";
        File fullModelFile1 = new File(modelFile1);
        boolean file1Exists = fullModelFile1.exists()
                || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1);
        String modelFile1Lex = param.pathToModelFile + ".level1.lex";
        File fullModelFile1Lex = new File(modelFile1Lex);
        boolean file1LexExists = fullModelFile1Lex.exists()
                || IOUtilities.existsInClasspath(NETaggerLevel1.class, modelFile1Lex);
        String modelFile2 = param.pathToModelFile + ".level2";
        File fullModelFile2 = new File(modelFile2);
        boolean file2Exists = fullModelFile2.exists()
                || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2);
        String modelFile2Lex = param.pathToModelFile + ".level2.lex";
        File fullModelFile2Lex = new File(modelFile2Lex);
        boolean file2LexExists = fullModelFile2Lex.exists()
                || IOUtilities.existsInClasspath(NETaggerLevel2.class, modelFile2Lex);
        if (!file1Exists || !file1LexExists
                || (rm.containsKey("PredictionsLevel1") && rm.getString("PredictionsLevel1").equals("1")
                        && (!file2Exists || !file2LexExists))) {
            // if we are not training, all model files must already exist
            if (!areWeTraining) {
                throw new IllegalArgumentException("Config File Error: one of " + param.pathToModelFile
                        + ".level{1,2}[.lex] does not exist.");
            } else {
                // if we are training, we need to have the train directory
                File trainDir = new File(rm.getString("pathToModelFile"));
                if (!trainDir.isDirectory())
                    trainDir.mkdirs();
            }
        }
        String taggingEncodingSchemeString = rm.getString("taggingEncodingScheme");
        if (taggingEncodingSchemeString == null) {
            throw new IllegalArgumentException(
                    "Config File Error: Expected value for non-optional 'taggingEncodingScheme'");
        }
        param.taggingEncodingScheme =
                TextChunkRepresentationManager.EncodingScheme.valueOf(taggingEncodingSchemeString);

        // Optional params
        if (rm.containsKey("auxiliaryModels")) {
            String auxListString = rm.getString("auxiliaryModels");
            // should be a tab-separated list of (config file, feature name) pairs
            String[] auxModels = auxListString.split("\\t");
            // FIXME: add a check so that if auxModels.length is odd, then we have a problem...
            for (int i = 0; i < auxModels.length; i += 2) {
                ResourceManager auxRm = new ResourceManager(auxModels[i]);
                // loading auxiliary models, never training
                ParametersForLbjCode aux = readAndLoadConfig(auxRm, false);
                aux.nameAsAuxFeature = auxModels[i + 1];
                loadClassifierModels(aux);
                param.auxiliaryModels.addElement(aux);
            }
        }
        if (rm.containsKey("normalizeTitleText")) {
            param.normalizeTitleText = Boolean.parseBoolean(rm.getString("normalizeTitleText"));
        }
        if (rm.containsKey("pathToTokenNormalizationData")) {
            param.pathToTokenNormalizationData = rm.getString("pathToTokenNormalizationData");
            TitleTextNormalizer.pathToBrownClusterForWordFrequencies = param.pathToTokenNormalizationData;
        }
        if (rm.containsKey("forceNewSentenceOnLineBreaks")) {
            param.forceNewSentenceOnLineBreaks =
                    Boolean.parseBoolean(rm.getString("forceNewSentenceOnLineBreaks"));
        }
        if (rm.containsKey("sortLexicallyFilesInFolders")) {
            param.sortLexicallyFilesInFolders =
                    Boolean.parseBoolean(rm.getString("sortLexicallyFilesInFolders"));
        }
        if (rm.containsKey("treatAllFilesInFolderAsOneBigDocument")) {
            param.treatAllFilesInFolderAsOneBigDocument =
                    Boolean.parseBoolean(rm.getString("treatAllFilesInFolderAsOneBigDocument"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel1")) {
            param.minConfidencePredictionsLevel1 =
                    Double.parseDouble(rm.getString("minConfidencePredictionsLevel1"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel2")) {
            param.minConfidencePredictionsLevel2 =
                    Double.parseDouble(rm.getString("minConfidencePredictionsLevel2"));
        }
        if (rm.containsKey("learningRatePredictionsLevel1")) {
            param.learningRatePredictionsLevel1 =
                    Double.parseDouble(rm.getString("learningRatePredictionsLevel1"));
        }
        if (rm.containsKey("learningRatePredictionsLevel2")) {
            param.learningRatePredictionsLevel2 =
                    Double.parseDouble(rm.getString("learningRatePredictionsLevel2"));
        }
        if (rm.containsKey("thicknessPredictionsLevel1")) {
            param.thicknessPredictionsLevel1 = Integer.parseInt(rm.getString("thicknessPredictionsLevel1"));
        }
        if (rm.containsKey("thicknessPredictionsLevel2")) {
            param.thicknessPredictionsLevel2 = Integer.parseInt(rm.getString("thicknessPredictionsLevel2"));
        }
        // labelTypes is just a String[], split on whitespace
        if (rm.containsKey("labelTypes")) {
            param.labelTypes = rm.getString("labelTypes").split("\\s+");
        }

        // Inclusion of all the features
        param.featuresToUse = new HashMap<>();
        for (String feature : possibleFeatures) {
            if (rm.containsKey(feature) && rm.getString(feature).equals("1")) {
                logger.debug("Adding feature: {}", feature);
                param.featuresToUse.put(feature, true);
            }
        }
        // Default positive features
        param.featuresToUse.put("TitleNormalization", true);
        param.featuresToUse.put("WordTopicTitleInfo", true);

        // GazetteersFeatures
        if (rm.containsKey("GazetteersFeatures") && rm.getString("GazetteersFeatures").equals("1")) {
            String pathToGazetteersLists = rm.getString("pathToGazetteersLists");
            if (rm.containsKey("FlatGazetteers") && Boolean.parseBoolean(rm.getString("FlatGazetteers"))) {
                logger.info("Loading FlatGazetteers");
                GazetteersFactory.init(5, pathToGazetteersLists, true);
            } else {
                int maxPhraseLength = 5;
                if (rm.containsKey(NerBaseConfigurator.PHRASE_LENGTH))
                    maxPhraseLength = rm.getInt(NerBaseConfigurator.PHRASE_LENGTH);
                GazetteersFactory.init(maxPhraseLength, pathToGazetteersLists, false);
            }
        }

        // WordEmbeddings feature
        String wordEmbeddingDebug = "";
        if (rm.containsKey("WordEmbeddings") && rm.getString("WordEmbeddings").equals("1")) {
            // each of these properties holds a whitespace-separated list
            Vector<String> pathsToWordEmbeddings =
                    getStringVector(rm.getString("pathsToWordEmbeddings").split("\\s+"));
            Vector<Integer> dimensionality =
                    getIntegerVector(rm.getString("embeddingDimensionalities").split("\\s+"));
            // Note: look for minWordAppThresholdsForEmbeddings
            Vector<Integer> wordAppThresEmbeddings =
                    getIntegerVector(rm.getString("minWordAppThresholdsForEmbeddings").split("\\s+"));
            // FIXME: check all others for things like this
            Vector<Boolean> isLowercaseWordEmbeddings =
                    getBooleanVector(rm.getString("isLowercaseWordEmbeddings").split("\\s+"));
            Vector<Double> normalizationConstantsForEmbeddings =
                    getDoubleVector(rm.getString("normalizationConstantsForEmbeddings").split("\\s+"));
            Vector<NormalizationMethod> normalizationMethodsForEmbeddings =
                    getNMVector(rm.getString("normalizationMethodsForEmbeddings").split("\\s+"));
            // Check that all vectors are the same length
            int standard = pathsToWordEmbeddings.size();
            if (dimensionality.size() != standard || wordAppThresEmbeddings.size() != standard
                    || isLowercaseWordEmbeddings.size() != standard
                    || normalizationConstantsForEmbeddings.size() != standard
                    || normalizationMethodsForEmbeddings.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for WordEmbeddings "
                        + "(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, "
                        + "isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, "
                        + "normalizationMethodsForEmbeddings) need to have the same number of parameters.");
            }
            WordEmbeddings.init(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings,
                    isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings,
                    normalizationMethodsForEmbeddings);
            for (int i = 0; i < pathsToWordEmbeddings.size(); i++) {
                wordEmbeddingDebug += "Words Embeddings resource: \n";
                wordEmbeddingDebug += "\t-Path: " + pathsToWordEmbeddings.elementAt(i) + "\n";
                wordEmbeddingDebug += "\t-Dimensionality=" + dimensionality.elementAt(i) + "\n";
                wordEmbeddingDebug += "\t-WordThres=" + wordAppThresEmbeddings.elementAt(i) + "\n";
                wordEmbeddingDebug += "\t-IsLowercased=" + isLowercaseWordEmbeddings.elementAt(i) + "\n";
            }
        }

        // BrownClusterPaths feature
        String brownDebug = "";
        if (rm.containsKey("BrownClusterPaths") && rm.getString("BrownClusterPaths").equals("1")) {
            // each of these properties holds a whitespace-separated list
            Vector<String> pathsToBrownClusters =
                    getStringVector(rm.getString("pathsToBrownClusters").split("\\s+"));
            Vector<Integer> minWordAppThresholdsForBrownClusters =
                    getIntegerVector(rm.getString("minWordAppThresholdsForBrownClusters").split("\\s+"));
            Vector<Boolean> lowercaseBrown =
                    getBooleanVector(rm.getString("isLowercaseBrownClusters").split("\\s+"));
            // Check that vectors are all the same length
            int standard = pathsToBrownClusters.size();
            if (minWordAppThresholdsForBrownClusters.size() != standard
                    || lowercaseBrown.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for BrownClusters "
                        + "(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, "
                        + "isLowercaseBrownClusters) need to have the same number of parameters.");
            }
            BrownClusters.init(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, lowercaseBrown);
            // For output later
            for (int i = 0; i < pathsToBrownClusters.size(); i++) {
                brownDebug += "Brown clusters resource: \n";
                brownDebug += "\t-Path: " + pathsToBrownClusters.elementAt(i) + "\n";
                brownDebug += "\t-WordThres=" + minWordAppThresholdsForBrownClusters.elementAt(i) + "\n";
                brownDebug += "\t-IsLowercased=" + lowercaseBrown.elementAt(i) + "\n";
            }
        }

        param.randomNoiseLevel = randomNoiseLevel;
        param.omissionRate = omissionRate;
        // don't forget that these should be initialized only after we know the target labels
        // and the encoding scheme
        param.patternLabelRandomGenerator =
                new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.level1AggregationRandomGenerator =
                new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel1RandomGenerator =
                new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel2RandomGenerator =
                new RandomLabelGenerator(param.labelTypes, param.taggingEncodingScheme, randomNoiseLevel);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return param;
}
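For orientation, a minimal sketch of a typical call site, assuming the NER default configuration supplies the non-optional keys checked above (modelName, pathToModelFile, taggingEncodingScheme):

// Load parameters for inference (areWeTraining = false); the model files
// named pathToModelFile + ".model.level{1,2}[.lex]" must already exist.
ResourceManager rm = new NerBaseConfigurator().getDefaultConfig();
ParametersForLbjCode params = Parameters.readAndLoadConfig(rm, false);
System.out.println("Model file prefix: " + params.pathToModelFile);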
Use of edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager in project cogcomp-nlp by CogComp.
From the class PipelineFactory, method buildPipeline:
/**
 * Create an AnnotatorService with components specified by the ResourceManager (to override
 * defaults in {@link PipelineConfigurator}).
 *
 * @param rm non-default config options
 * @return AnnotatorService with specified NLP components
 * @throws IOException
 * @throws AnnotatorException
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm) throws IOException, AnnotatorException {
    // Merges default configuration with the user-specified overrides.
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(rm));
    Boolean splitOnDash = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    boolean isSentencePipeline = fullRm.getBoolean(PipelineConfigurator.USE_SENTENCE_PIPELINE.key);
    if (isSentencePipeline) {
        // update cache directory to be distinct from regular pipeline
        String cacheDir = fullRm.getString(AnnotatorServiceConfigurator.CACHE_DIR.key);
        cacheDir += "_sentence";
        Properties props = fullRm.getProperties();
        props.setProperty(AnnotatorServiceConfigurator.CACHE_DIR.key, cacheDir);
        fullRm = new ResourceManager(props);
    }
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnDash));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return isSentencePipeline ? new SentencePipeline(taBldr, annotators, fullRm)
            : new BasicAnnotatorService(taBldr, annotators, fullRm);
}
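A minimal sketch of a caller, assuming the default pipeline components are available locally; the one overridden key simply forces the document-level (non-sentence) pipeline:

// Build a document-level pipeline and annotate a short text.
Properties props = new Properties();
props.setProperty(PipelineConfigurator.USE_SENTENCE_PIPELINE.key, Configurator.FALSE);
BasicAnnotatorService pipeline = PipelineFactory.buildPipeline(new ResourceManager(props));
TextAnnotation ta = pipeline.createAnnotatedTextAnnotation("corpus", "doc0",
        "The CogComp pipeline was developed at the University of Illinois.");
System.out.println(ta.getAvailableViews());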
Use of edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager in project cogcomp-nlp by CogComp.
From the class NerInitTest, method testInit:
@Test
public void testInit() {
    Properties props = new Properties();
    props.setProperty(NerBaseConfigurator.GAZETTEER_FEATURES, "0");
    props.setProperty(NerBaseConfigurator.BROWN_CLUSTER_PATHS, "0");
    ResourceManager rm = (new NerBaseConfigurator()).getConfig(new ResourceManager(props));
    NERAnnotator ner = NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_CONLL);
    assertNotNull(ner);
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = tab.createTextAnnotation(TESTSTR);
    try {
        ner.getView(ta);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
    assert (ta.hasView(ViewNames.NER_CONLL));
    assertEquals(ta.getView(ViewNames.NER_CONLL).getConstituents().size(), 2);
}
Use of edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager in project cogcomp-nlp by CogComp.
From the class NerBaseConfigurator, method getDefaultConfig:
@Override
public ResourceManager getDefaultConfig() {
    Properties props = new Properties();
    props.setProperty(VIEW_NAME, ViewNames.NER_CONLL);
    props.setProperty(PHRASE_LENGTH, DEFAULT_PHRASE_LENGTH);
    props.setProperty(PATH_TO_MODEL, DEFAULT_MODEL_PATH);
    props.setProperty(MODEL_NAME, DEFAULT_MODEL_NAME);
    props.setProperty(AFFIXES, DEFAULT_AFFIXES);
    props.setProperty(AGGREGATE_CONTEXT, DEFAULT_AGGREGATE_CONTENT);
    props.setProperty(AGGREGATE_GAZETTEER, DEFAULT_AGGREGATE_GAZETTEER);
    props.setProperty(DEBUG, DEFAULT_DEBUG);
    props.setProperty(BROWN_CLUSTER_PATHS, DEFAULT_BROWN_CLUSTER_PATHS);
    props.setProperty(CAPITALIZATION, DEFAULT_CAPITALIZATION);
    props.setProperty(FORCE_NEW_SENTENCE_ON_LINE_BREAKS, DEFAULT_FORCE_LINE_BREAKS);
    props.setProperty(FORMS, DEFAULT_FORMS);
    props.setProperty(GAZETTEER_FEATURES, DEFAULT_GAZETTEER_FEATURES);
    props.setProperty(IS_LOWERCASE_BROWN_CLUSTERS, DEFAULT_IS_LOWERCASE_BROWN_CLUSTERS);
    props.setProperty(LABEL_TYPES, DEFAULT_LABELS);
    props.setProperty(NORMALIZE_TITLE_TEXT, DEFAULT_NORMALIZE_TITLE);
    // props.setProperty(TOKENIZATION_SCHEME, DEFAULT_TOKENIZATION_SCHEME);
    props.setProperty(TAG_SCHEME, DEFAULT_TAG_SCHEME);
    props.setProperty(TREAT_ALL_FILES_AS_ONE, DEFAULT_TREAT_ALL_FILES_AS_ONE);
    props.setProperty(PATH_TO_TOKEN_NORM_DATA, DEFAULT_PATH_TO_TOKEN_NORM_DATA);
    props.setProperty(MIN_CONFIDENCE_PREDICTIONS_1, DEFAULT_MIN_CONFIDENCE_PREDICTIONS_1);
    props.setProperty(MIN_CONFIDENCE_PREDICTIONS_2, DEFAULT_MIN_CONFIDENCE_PREDICTIONS_2);
    props.setProperty(SORT_FILES_LEXICALLY, DEFAULT_SORT_FILES_LEXICALLY);
    props.setProperty(PREV_TAG_1, DEFAULT_PREV_TAG_1);
    props.setProperty(PREV_TAG_2, DEFAULT_PREV_TAG_2);
    props.setProperty(PREV_TAG_PATTERN_1, DEFAULT_PREV_TAG_PATTERN_1);
    props.setProperty(PREV_TAG_PATTERN_2, DEFAULT_PREV_TAG_PATTERN_2);
    props.setProperty(PREV_TAGS_FOR_CONTEXT, DEFAULT_PREV_TAGS_FOR_CONTEXT);
    props.setProperty(WORD_TYPE_INFORMATION, DEFAULT_WORD_TYPE_INFORMATION);
    props.setProperty(PREDICTIONS_1, DEFAULT_PREDICTIONS_1);
    props.setProperty(PATHS_TO_BROWN_CLUSTERS, DEFAULT_PATHS_TO_BROWN_CLUSTERS);
    props.setProperty(WORD_EMBEDDINGS, DEFAULT_WORD_EMBEDDINGS);
    props.setProperty(PATH_TO_GAZETTEERS, DEFAULT_PATHS_TO_GAZETTEERS);
    props.setProperty(MIN_WORD_APP_THRESHOLDS_FOR_BROWN_CLUSTERS, DEFAULT_MIN_WORD_APP_THRESHOLDS_FOR_BROWN_CLUSTERS);
    props.setProperty(RANDOM_NOISE_LEVEL, DEFAULT_RANDOM_NOISE_LEVEL);
    props.setProperty(OMISSION_RATE, DEFAULT_OMISSION_RATE);
    props.setProperty(IS_LAZILY_INITIALIZED.key, TRUE);
    props.setProperty(LANGUAGE, DEFAULT_LANGUAGE);
    props.setProperty(IS_SENTENCE_LEVEL.key, IS_SENTENCE_LEVEL.value);
    return new ResourceManager(props);
}
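Callers rarely use these defaults verbatim; as testInit above shows, a sparse ResourceManager of overrides can be merged with them via getConfig. A minimal sketch:

// Override only the gazetteer flag; every other key keeps its default value.
Properties overrides = new Properties();
overrides.setProperty(NerBaseConfigurator.GAZETTEER_FEATURES, "0");
ResourceManager merged = new NerBaseConfigurator().getConfig(new ResourceManager(overrides));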