use of edu.illinois.cs.cogcomp.core.datastructures.Lexicon in project cogcomp-nlp by CogComp.
the class ModelInfo method loadLexicon.
/**
 * Locates and loads the lexicon, or creates a fresh one when no lexicon file can be found.
 *
 * <p>Lookup order:
 * <ol>
 * <li>If the {@code LOAD_MODELS_FROM_DATASTORE} option is true, the lexicon is read from the
 * {@code datastoreModels} directory. The file's existence is not checked in this branch, so a
 * missing file surfaces as an {@link IOException} when the stream is opened.</li>
 * <li>Otherwise the lexicon file name is tried as a file-system path, and failing that, as a
 * resource listed via {@code IOUtils.lsResources}.</li>
 * </ol>
 *
 * @return the loaded lexicon, or a new lexicon if no source was located
 * @throws IOException if the located lexicon cannot be opened or read
 * @throws RuntimeException wrapping any failure while probing the file system / resources
 */
private Lexicon loadLexicon() throws IOException {
    String lexiconFile = manager.getLexiconFileName();
    URL url = null;

    if (Boolean.valueOf(rm.getString(VerbSenseConfigurator.LOAD_MODELS_FROM_DATASTORE.key))) {
        url = new File(datastoreModels + File.separator + lexiconFile).toURI().toURL();
    } else {
        try {
            if (!IOUtils.exists(lexiconFile)) {
                // Not on disk: fall back to the first matching resource, if any.
                List<URL> list = IOUtils.lsResources(SenseManager.class, lexiconFile);
                if (list.size() > 0) {
                    url = list.get(0);
                }
            } else {
                url = new File(lexiconFile).toURI().toURL();
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    if (url == null) {
        log.info("Lexicon file {} missing. Creating new lexicon.", lexiconFile);
        return new Lexicon(true, false);
    }

    log.info("Lexicon file {} found.", lexiconFile);
    long start = System.currentTimeMillis();
    Lexicon lexicon;
    // Close the stream even if the Lexicon constructor fails part-way through reading;
    // a second close() by the constructor itself (if it does one) is harmless.
    try (java.io.InputStream in = url.openStream()) {
        lexicon = new Lexicon(in);
    }
    long end = System.currentTimeMillis();
    log.info("Finished loading lexicon. Took {} ms", (end - start));
    return lexicon;
}
use of edu.illinois.cs.cogcomp.core.datastructures.Lexicon in project cogcomp-nlp by CogComp.
the class PruningPreExtractor method consume.
/**
 * Prunes the cached feature vector of one instance and queues it for writing to the cache.
 *
 * <p>The instance's features are pruned with the lexicon's count-based pruning using the
 * manager's prune size, wrapped in a {@code PreExtractRecord} and appended to the shared
 * {@code buffer}. Once the buffer holds more than 10000 records, the whole buffer is written to
 * {@code cache} and cleared. Finally the processed-item counter is incremented.
 *
 * @param input the instance and its gold structure
 * @throws RuntimeException wrapping any exception raised while writing a record to the cache
 */
@Override
protected void consume(Pair<SenseInstance, SenseStructure> input) {
    // Records are flushed to the cache once the buffer grows beyond this size.
    final int flushThreshold = 10000;

    SenseInstance x = input.getFirst();
    SenseStructure y = input.getSecond();

    Lexicon lexicon = manager.getModelInfo().getLexicon();
    int threshold = manager.getPruneSize();

    FeatureVector cached = x.getCachedFeatureVector();
    Pair<int[], float[]> pruned =
            lexicon.pruneFeaturesByCount(cached.getIdx(), cached.getValue(), threshold);
    FeatureVector features = new FeatureVector(pruned.getFirst(), pruned.getSecond());

    // Add and (possibly) flush under a single lock acquisition, so that buffer.size() is never
    // read without holding the buffer's monitor.
    synchronized (buffer) {
        buffer.add(new PreExtractRecord(x.getPredicateLemma(), y.getLabel(), features));
        if (buffer.size() > flushThreshold) {
            for (PreExtractRecord r : buffer) {
                try {
                    cache.put(r.lemma, r.label, r.features);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
            buffer.clear();
        }
    }

    counter.incrementAndGet();
}
use of edu.illinois.cs.cogcomp.core.datastructures.Lexicon in project cogcomp-nlp by CogComp.
the class VerbSenseClassifierMain method preExtract.
/**
 * Command entry point: extracts features for the {@code PTBTrainDev} dataset, prunes them, and
 * saves the pruned lexicon to the manager's lexicon file.
 *
 * <p>Steps: ensure the models directory exists, pre-extract features into a cache file keyed by
 * the hash of the included feature set, prune that cache into a second cache file, then obtain
 * the pruned lexicon (using the manager's prune size) and save it.
 *
 * @throws Exception if any of the extraction, pruning or saving steps fails
 */
@CommandDescription(description = "Pre-extracts the features for the verb-sense model. Run this before training.", usage = "preExtract")
public static void preExtract() throws Exception {
    SenseManager senseManager = getManager(true);
    ResourceManager defaults = new VerbSenseConfigurator().getDefaultConfig();

    // Create the models directory if it is not there yet.
    // NOTE(review): the value looked up for MODELS_DIRECTORY is itself used as a key for a second
    // lookup; this mirrors the existing behavior but looks unintended -- confirm.
    String modelsDir =
            defaults.getString(defaults.getString(VerbSenseConfigurator.MODELS_DIRECTORY));
    if (!IOUtils.isDirectory(modelsDir)) {
        IOUtils.mkdir(modelsDir);
    }

    int consumerCount = Runtime.getRuntime().availableProcessors();
    Dataset trainingData = Dataset.PTBTrainDev;

    log.info("Pre-extracting features");

    ModelInfo info = senseManager.getModelInfo();
    // The feature-set identifier is the hash code of the included features, as a string.
    String featureSetId = "" + info.featureManifest.getIncludedFeatures().hashCode();

    String fullCachePath =
            VerbSenseConfigurator.getFeatureCacheFile(featureSetId, trainingData, rm);
    FeatureVectorCacheFile extracted =
            preExtract(consumerCount, senseManager, trainingData, fullCachePath);

    String prunedCachePath = VerbSenseConfigurator.getPrunedFeatureCacheFile(featureSetId, rm);
    pruneFeatures(consumerCount, senseManager, extracted, prunedCachePath);

    Lexicon prunedLexicon = info.getLexicon().getPrunedLexicon(senseManager.getPruneSize());
    log.info("Saving lexicon with {} features to {}", prunedLexicon.size(), senseManager.getLexiconFileName());
    log.info(prunedLexicon.size() + " features in the lexicon");
    prunedLexicon.save(senseManager.getLexiconFileName());
}
Aggregations