Example usage of edu.illinois.cs.cogcomp.core.constants.Language in the cogcomp-nlp project by CogComp.
From the class Parameters, the method readAndLoadConfig:
/**
 * This is the method that does all the work. This populates and returns a
 * {@link ParametersForLbjCode} object, which is then used throughout the codebase.
 *
 * @param rm a populated <code>ResourceManager</code>.
 * @param areWeTraining this value determines whether or not this run will involve training a
 *        model. If we are training, then we make sure there exists a folder in which to put the
 *        trained model. If not, then we make sure the model exists.
 * @return a {@link ParametersForLbjCode} object populated according to the
 *         <code>ResourceManager</code> argument.
 */
public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean areWeTraining) {
    ParametersForLbjCode param = new ParametersForLbjCode();
    try {
        // First check for any empty answers (NOT allowed):
        Enumeration<Object> enumeration = rm.getKeys();
        while (enumeration.hasMoreElements()) {
            String el = (String) enumeration.nextElement();
            if (rm.getString(el).isEmpty()) {
                throw new IllegalArgumentException("Config File Error: parameter " + el
                        + " has no value. Either comment it out (with #), or remove it.");
            }
        }
        param.debug = rm.getDebug();
        double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL);
        double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE);

        // Required params.
        String cFilename = rm.getString(NerBaseConfigurator.MODEL_NAME);
        if (cFilename == null) {
            throw new IllegalArgumentException("Config File Error: Expected value for non-optional '"
                    + NerBaseConfigurator.MODEL_NAME + "'");
        }
        param.configFilename = cFilename;
        if (rm.containsKey("language")) {
            Language lang = Language.getLanguageByCode(rm.getString("language"));
            param.language = lang;
        }
        // Label lists are space-separated in the config file.
        if (rm.containsKey("labelsToAnonymizeInEvaluation")) {
            String labelsToAnonymizeInEvaluation = rm.getString("labelsToAnonymizeInEvaluation");
            param.labelsToAnonymizeInEvaluation =
                    new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" ")));
        }
        if (rm.containsKey("labelsToIgnoreInEvaluation")) {
            String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation");
            param.labelsToIgnoreInEvaluation =
                    new Vector<>(Arrays.asList(labelsToIgnoreInEvaluation.split(" ")));
        }
        if (rm.getString("pathToModelFile") == null) {
            throw new IllegalArgumentException(
                    "Config File Error: Expected value for non-optional 'pathToModelFile'");
        }
        param.pathToModelFile =
                rm.getString("pathToModelFile") + "/" + param.configFilename + ".model";
        String taggingEncodingSchemeString = rm.getString("taggingEncodingScheme");
        if (taggingEncodingSchemeString == null) {
            throw new IllegalArgumentException(
                    "Config File Error: Expected value for non-optional 'taggingEncodingScheme'");
        }
        param.taggingEncodingScheme =
                TextChunkRepresentationManager.EncodingScheme.valueOf(taggingEncodingSchemeString);

        // Optional params.
        // 'auxiliaryModels' is a tab-separated flat list of (configPath, featureName) pairs.
        if (rm.containsKey("auxiliaryModels")) {
            String auxListString = rm.getString("auxiliaryModels");
            String[] auxModels = auxListString.split("\\t");
            // FIX: an odd-length list used to fall through to an ArrayIndexOutOfBoundsException
            // at auxModels[i + 1]; fail fast with a clear message instead.
            if (auxModels.length % 2 != 0) {
                throw new IllegalArgumentException("Config File Error: 'auxiliaryModels' must be a "
                        + "tab-separated list of (config path, feature name) pairs; got "
                        + auxModels.length + " entries.");
            }
            for (int i = 0; i < auxModels.length; i += 2) {
                ResourceManager auxRm = new ResourceManager(auxModels[i]);
                // Auxiliary models are always loaded for inference, never trained here.
                ParametersForLbjCode aux = readAndLoadConfig(auxRm, false);
                aux.nameAsAuxFeature = auxModels[i + 1];
                loadClassifierModels(aux, param);
                param.auxiliaryModels.addElement(aux);
            }
        }
        if (rm.containsKey("normalizeTitleText")) {
            param.normalizeTitleText = Boolean.parseBoolean(rm.getString("normalizeTitleText"));
        }
        if (rm.containsKey("pathToTokenNormalizationData")) {
            param.pathToTokenNormalizationData = rm.getString("pathToTokenNormalizationData");
            // Side effect: the title-text normalizer reads this static path.
            TitleTextNormalizer.pathToBrownClusterForWordFrequencies =
                    param.pathToTokenNormalizationData;
        }
        if (rm.containsKey("forceNewSentenceOnLineBreaks")) {
            param.forceNewSentenceOnLineBreaks =
                    Boolean.parseBoolean(rm.getString("forceNewSentenceOnLineBreaks"));
        }
        if (rm.containsKey("sortLexicallyFilesInFolders")) {
            param.sortLexicallyFilesInFolders =
                    Boolean.parseBoolean(rm.getString("sortLexicallyFilesInFolders"));
        }
        if (rm.containsKey("treatAllFilesInFolderAsOneBigDocument")) {
            param.treatAllFilesInFolderAsOneBigDocument =
                    Boolean.parseBoolean(rm.getString("treatAllFilesInFolderAsOneBigDocument"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel1")) {
            param.minConfidencePredictionsLevel1 =
                    Double.parseDouble(rm.getString("minConfidencePredictionsLevel1"));
        }
        if (rm.containsKey("minConfidencePredictionsLevel2")) {
            param.minConfidencePredictionsLevel2 =
                    Double.parseDouble(rm.getString("minConfidencePredictionsLevel2"));
        }
        if (rm.containsKey("learningRatePredictionsLevel1")) {
            param.learningRatePredictionsLevel1 =
                    Double.parseDouble(rm.getString("learningRatePredictionsLevel1"));
        }
        if (rm.containsKey("learningRatePredictionsLevel2")) {
            param.learningRatePredictionsLevel2 =
                    Double.parseDouble(rm.getString("learningRatePredictionsLevel2"));
        }
        if (rm.containsKey("thicknessPredictionsLevel1")) {
            param.thicknessPredictionsLevel1 =
                    Integer.parseInt(rm.getString("thicknessPredictionsLevel1"));
        }
        if (rm.containsKey("thicknessPredictionsLevel2")) {
            param.thicknessPredictionsLevel2 =
                    Integer.parseInt(rm.getString("thicknessPredictionsLevel2"));
        }
        // labelTypes is just a whitespace-separated String[].
        if (rm.containsKey("labelTypes")) {
            param.labelTypes = rm.getString("labelTypes").split("\\s+");
        }

        // Inclusion of all the features: a feature is enabled when its key maps to "1".
        param.featuresToUse = new HashMap<>();
        for (String feature : possibleFeatures) {
            if (rm.containsKey(feature) && rm.getString(feature).equals("1")) {
                logger.debug("Adding feature: {}", feature);
                param.featuresToUse.put(feature, true);
            }
        }
        // These two features are always on by default.
        param.featuresToUse.put("TitleNormalization", true);
        param.featuresToUse.put("WordTopicTitleInfo", true);

        // If enabled, load up the gazetteers.
        if (rm.containsKey("GazetteersFeatures")
                && rm.getString("GazetteersFeatures").equals("1")) {
            String pathToGazetteersLists = rm.getString("pathToGazetteersLists");
            if (rm.containsKey("FlatGazetteers")
                    && Boolean.parseBoolean(rm.getString("FlatGazetteers"))) {
                logger.info("Loading FlatGazetteers");
                param.gazetteers = GazetteersFactory.get(5, pathToGazetteersLists, true,
                        param.language);
            } else {
                int maxPhraseLength = 5;
                if (rm.containsKey(NerBaseConfigurator.PHRASE_LENGTH))
                    maxPhraseLength = rm.getInt(NerBaseConfigurator.PHRASE_LENGTH);
                param.gazetteers = GazetteersFactory.get(maxPhraseLength, pathToGazetteersLists,
                        false, param.language);
            }
        }

        // WordEmbeddings feature: six parallel whitespace-separated lists.
        String wordEmbeddingDebug = "";
        if (rm.containsKey("WordEmbeddings") && rm.getString("WordEmbeddings").equals("1")) {
            Vector<String> pathsToWordEmbeddings =
                    getStringVector(rm.getString("pathsToWordEmbeddings").split("\\s+"));
            Vector<Integer> dimensionality =
                    getIntegerVector(rm.getString("embeddingDimensionalities").split("\\s+"));
            Vector<Integer> wordAppThresEmbeddings = getIntegerVector(
                    rm.getString("minWordAppThresholdsForEmbeddings").split("\\s+"));
            Vector<Boolean> isLowercaseWordEmbeddings =
                    getBooleanVector(rm.getString("isLowercaseWordEmbeddings").split("\\s+"));
            Vector<Double> normalizationConstantsForEmbeddings = getDoubleVector(
                    rm.getString("normalizationConstantsForEmbeddings").split("\\s+"));
            Vector<NormalizationMethod> normalizationMethodsForEmbeddings =
                    getNMVector(rm.getString("normalizationMethodsForEmbeddings").split("\\s+"));
            // All six lists must be the same length.
            int standard = pathsToWordEmbeddings.size();
            if (dimensionality.size() != standard || wordAppThresEmbeddings.size() != standard
                    || isLowercaseWordEmbeddings.size() != standard
                    || normalizationConstantsForEmbeddings.size() != standard
                    || normalizationMethodsForEmbeddings.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for WordEmbeddings "
                        + "(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings, "
                        + "isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings, "
                        + "normalizationMethodsForEmbeddings) need to have the same number of parameters.");
            }
            WordEmbeddings.init(pathsToWordEmbeddings, dimensionality, wordAppThresEmbeddings,
                    isLowercaseWordEmbeddings, normalizationConstantsForEmbeddings,
                    normalizationMethodsForEmbeddings);
            for (int i = 0; i < pathsToWordEmbeddings.size(); i++) {
                wordEmbeddingDebug += "Words Embeddings resource: \n";
                wordEmbeddingDebug += "\t-Path: " + pathsToWordEmbeddings.elementAt(i) + "\n";
                wordEmbeddingDebug += "\t-Dimensionality=" + dimensionality.elementAt(i) + "\n";
                wordEmbeddingDebug += "\t-WordThres=" + wordAppThresEmbeddings.elementAt(i) + "\n";
                wordEmbeddingDebug +=
                        "\t-IsLowercased=" + isLowercaseWordEmbeddings.elementAt(i) + "\n";
            }
            // FIX: this summary used to be built and then discarded; emit it.
            logger.debug(wordEmbeddingDebug);
        }

        // If enabled, load up the brown clusters: three parallel whitespace-separated lists.
        String brownDebug = "";
        if (rm.containsKey("BrownClusterPaths")
                && rm.getString("BrownClusterPaths").equals("1")) {
            Vector<String> pathsToBrownClusters =
                    getStringVector(rm.getString("pathsToBrownClusters").split("\\s+"));
            Vector<Integer> minWordAppThresholdsForBrownClusters = getIntegerVector(
                    rm.getString("minWordAppThresholdsForBrownClusters").split("\\s+"));
            Vector<Boolean> lowercaseBrown =
                    getBooleanVector(rm.getString("isLowercaseBrownClusters").split("\\s+"));
            // All three lists must be the same length.
            int standard = pathsToBrownClusters.size();
            if (minWordAppThresholdsForBrownClusters.size() != standard
                    || lowercaseBrown.size() != standard) {
                throw new IllegalArgumentException("Config file error: all resources for BrownClusters "
                        + "(pathsToBrownClusters, minWordAppThresholdsForBrownClusters, "
                        + "isLowercaseBrownClusters) need to have the same number of parameters.");
            }
            // NOTE(review): a 'UseLocalBrownCluster' flag used to be read here but was never
            // consumed; the dead read was removed. The config key is simply ignored now, as
            // it effectively was before.
            param.brownClusters = BrownClusters.get(pathsToBrownClusters,
                    minWordAppThresholdsForBrownClusters, lowercaseBrown);
            // For output later.
            for (int i = 0; i < pathsToBrownClusters.size(); i++) {
                brownDebug += "Brown clusters resource: \n";
                brownDebug += "\t-Path: " + pathsToBrownClusters.elementAt(i) + "\n";
                brownDebug +=
                        "\t-WordThres=" + minWordAppThresholdsForBrownClusters.elementAt(i) + "\n";
                brownDebug += "\t-IsLowercased=" + lowercaseBrown.elementAt(i) + "\n";
            }
            // FIX: this summary used to be built and then discarded; emit it.
            logger.debug(brownDebug);
        }

        param.randomNoiseLevel = randomNoiseLevel;
        param.omissionRate = omissionRate;
        param.featurePruningThreshold = rm.getDouble(NerBaseConfigurator.FEATUREPRUNINGTHRESHOLD);
        // Don't forget that these should be initialized only after we know the target labels
        // and the encoding scheme.
        param.patternLabelRandomGenerator = new RandomLabelGenerator(param.labelTypes,
                param.taggingEncodingScheme, randomNoiseLevel);
        param.level1AggregationRandomGenerator = new RandomLabelGenerator(param.labelTypes,
                param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel1RandomGenerator = new RandomLabelGenerator(param.labelTypes,
                param.taggingEncodingScheme, randomNoiseLevel);
        param.prevPredictionsLevel2RandomGenerator = new RandomLabelGenerator(param.labelTypes,
                param.taggingEncodingScheme, randomNoiseLevel);
    } catch (IOException e) {
        // FIX: this was e.printStackTrace(), which swallowed the failure silently. Log it
        // through the class logger instead; we still return the (possibly partial) param
        // object to keep the method's long-standing contract for existing callers.
        logger.error("IO error while reading NER configuration", e);
    }
    return param;
}
Example usage of edu.illinois.cs.cogcomp.core.constants.Language in the cogcomp-nlp project by CogComp.
From the class PipelineFactory, the method buildAnnotators:
/**
 * instantiate a set of annotators for use in an AnnotatorService object by default, will use
 * lazy initialization where possible -- change this behavior with the
 * {@link PipelineConfigurator#USE_LAZY_INITIALIZATION} property.
 *
 * @param nonDefaultRm ResourceManager with all non-default values for Annotators
 * @return a Map from annotator view name to annotator
 * @throws IOException if an SRL annotator fails to initialize
 */
private static Map<String, Annotator> buildAnnotators(ResourceManager nonDefaultRm) throws IOException {
    // Layer pipeline defaults, then Stanford defaults, under the caller's overrides.
    ResourceManager rm = new PipelineConfigurator()
            .getConfig(new Stanford331Configurator().getConfig(nonDefaultRm));
    String timePerSentence = rm.getString(Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    String maxParseSentenceLength =
            rm.getString(Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    boolean useLazyInitialization = rm.getBoolean(
            PipelineConfigurator.USE_LAZY_INITIALIZATION.key, PipelineConfigurator.TRUE);
    Map<String, Annotator> viewGenerators = new HashMap<>();
    if (rm.getBoolean(PipelineConfigurator.USE_POS)) {
        POSAnnotator pos = new POSAnnotator();
        viewGenerators.put(pos.getViewName(), pos);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_LEMMA)) {
        IllinoisLemmatizer lem = new IllinoisLemmatizer(rm);
        viewGenerators.put(lem.getViewName(), lem);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_SHALLOW_PARSE)) {
        viewGenerators.put(ViewNames.SHALLOW_PARSE, new ChunkerAnnotator());
    }
    if (rm.getBoolean(PipelineConfigurator.USE_NER_CONLL)) {
        NERAnnotator nerConll = NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_CONLL);
        viewGenerators.put(nerConll.getViewName(), nerConll);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_NER_ONTONOTES)) {
        NERAnnotator nerOntonotes =
                NerAnnotatorManager.buildNerAnnotator(rm, ViewNames.NER_ONTONOTES);
        viewGenerators.put(nerOntonotes.getViewName(), nerOntonotes);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_DEP)) {
        DepAnnotator dep = new DepAnnotator();
        viewGenerators.put(dep.getViewName(), dep);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_STANFORD_DEP)
            || rm.getBoolean(PipelineConfigurator.USE_STANFORD_PARSE)) {
        Properties stanfordProps = new Properties();
        stanfordProps.put("annotators", "pos, parse");
        stanfordProps.put("parse.originalDependencies", true);
        stanfordProps.put("parse.maxlen", maxParseSentenceLength);
        // Per sentence? Could be per document, but no indication either way in the
        // Stanford javadoc.
        stanfordProps.put("parse.maxtime", timePerSentence);
        POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
        ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
        int maxLength = Integer.parseInt(maxParseSentenceLength);
        boolean throwExceptionOnSentenceLengthCheck = rm.getBoolean(
                Stanford331Configurator.THROW_EXCEPTION_ON_FAILED_LENGTH_CHECK.key);
        if (rm.getBoolean(PipelineConfigurator.USE_STANFORD_DEP)) {
            StanfordDepHandler depParser = new StanfordDepHandler(posAnnotator, parseAnnotator,
                    maxLength, throwExceptionOnSentenceLengthCheck);
            viewGenerators.put(depParser.getViewName(), depParser);
        }
        if (rm.getBoolean(PipelineConfigurator.USE_STANFORD_PARSE)) {
            StanfordParseHandler parser = new StanfordParseHandler(posAnnotator, parseAnnotator,
                    maxLength, throwExceptionOnSentenceLengthCheck);
            viewGenerators.put(parser.getViewName(), parser);
        }
    }
    if (rm.getBoolean(PipelineConfigurator.USE_SRL_VERB)) {
        Properties verbProps = new Properties();
        String verbType = SRLType.Verb.name();
        verbProps.setProperty(SrlConfigurator.SRL_TYPE.key, verbType);
        ResourceManager verbRm = new ResourceManager(verbProps);
        // FIX: previously this merge was assigned back to 'rm', leaking SRL_TYPE=Verb into
        // every subsequent feature lookup. Keep the merged config local to this branch.
        ResourceManager verbSrlRm = Configurator.mergeProperties(rm, verbRm);
        try {
            SemanticRoleLabeler verbSrl = new SemanticRoleLabeler(verbSrlRm, useLazyInitialization);
            viewGenerators.put(ViewNames.SRL_VERB, verbSrl);
        } catch (Exception e) {
            // FIX: preserve the original exception as the cause instead of flattening it
            // to a message string.
            throw new IOException("SRL verb cannot init: " + e.getMessage(), e);
        }
    }
    if (rm.getBoolean(PipelineConfigurator.USE_SRL_NOM)) {
        Properties nomProps = new Properties();
        String nomType = SRLType.Nom.name();
        nomProps.setProperty(SrlConfigurator.SRL_TYPE.key, nomType);
        ResourceManager nomRm = new ResourceManager(nomProps);
        // FIX: same as the verb branch -- keep the merged config local rather than
        // clobbering 'rm'.
        ResourceManager nomSrlRm = Configurator.mergeProperties(rm, nomRm);
        try {
            SemanticRoleLabeler nomSrl = new SemanticRoleLabeler(nomSrlRm, useLazyInitialization);
            // Note that you can't call nomSrl (or verbSrl).getViewName() as it may not be
            // initialized yet.
            viewGenerators.put(ViewNames.SRL_NOM, nomSrl);
        } catch (Exception e) {
            throw new IOException("SRL nom cannot init .." + e.getMessage(), e);
        }
    }
    if (rm.getBoolean(PipelineConfigurator.USE_QUANTIFIER)) {
        Quantifier quantifierAnnotator = new Quantifier();
        viewGenerators.put(ViewNames.QUANTITIES, quantifierAnnotator);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_TRANSLITERATION)) {
        for (Language lang : TransliterationAnnotator.supportedLanguages) {
            TransliterationAnnotator transliterationAnnotator =
                    new TransliterationAnnotator(true, lang);
            viewGenerators.put(ViewNames.TRANSLITERATION + "_" + lang.getCode(),
                    transliterationAnnotator);
        }
    }
    if (rm.getBoolean(PipelineConfigurator.USE_SRL_PREP)) {
        PrepSRLAnnotator prepSRLAnnotator = new PrepSRLAnnotator();
        viewGenerators.put(ViewNames.SRL_PREP, prepSRLAnnotator);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_SRL_COMMA)) {
        CommaLabeler commaLabeler = new CommaLabeler();
        viewGenerators.put(ViewNames.SRL_COMMA, commaLabeler);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_VERB_SENSE)) {
        VerbSenseAnnotator verbSense = new VerbSenseAnnotator();
        viewGenerators.put(ViewNames.VERB_SENSE, verbSense);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_MENTION)) {
        MentionAnnotator mentionAnnotator = new MentionAnnotator("ACE_TYPE");
        viewGenerators.put(ViewNames.MENTION, mentionAnnotator);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_RELATION)) {
        viewGenerators.put(ViewNames.RELATION, new RelationAnnotator(true));
    }
    if (rm.getBoolean(PipelineConfigurator.USE_TIMEX3)) {
        Properties rmProps = new TemporalChunkerConfigurator().getDefaultConfig().getProperties();
        TemporalChunkerAnnotator tca = new TemporalChunkerAnnotator(new ResourceManager(rmProps));
        viewGenerators.put(ViewNames.TIMEX3, tca);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_DATALESS_ESA)) {
        // FIX: this branch used to REPLACE 'rm' with the ESA dataless config, so every
        // later rm.getBoolean(...) check (W2V, question typer) read the wrong
        // configuration. Use a branch-local config instead.
        ResourceManager esaRm = new ESADatalessConfigurator().getConfig(nonDefaultRm);
        ESADatalessAnnotator esaDataless = new ESADatalessAnnotator(esaRm);
        viewGenerators.put(ViewNames.DATALESS_ESA, esaDataless);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_DATALESS_W2V)) {
        // FIX: same rm-clobbering problem as the ESA branch; keep the config local.
        ResourceManager w2vRm = new W2VDatalessConfigurator().getConfig(nonDefaultRm);
        W2VDatalessAnnotator w2vDataless = new W2VDatalessAnnotator(w2vRm);
        viewGenerators.put(ViewNames.DATALESS_W2V, w2vDataless);
    }
    if (rm.getBoolean(PipelineConfigurator.USE_QUESTION_TYPER)) {
        QuestionTypeAnnotator questionTyper = new QuestionTypeAnnotator();
        viewGenerators.put(ViewNames.QUESTION_TYPE, questionTyper);
    }
    return viewGenerators;
}
Aggregations