Use of edu.neu.ccs.pyramid.feature_extraction.NgramTemplate in project pyramid by cheng-li.
The method gather of class App1.
/**
 * Gathers n-grams for every configured (field, n, slop) combination and
 * returns the distinct n-grams that were accepted.
 *
 * <p>For each combination an {@link NgramTemplate} is built and handed to
 * {@code NgramEnumerator.gatherNgram} together with a frequency threshold of
 * {@code floor(ids.length * minDf)}. Each returned n-gram is added, with its
 * count, to a shared multiset only when {@code interesting(...)} accepts it.
 *
 * <p>Configuration keys read: {@code output.folder},
 * {@code train.feature.ngram.n}, {@code train.feature.ngram.minDf},
 * {@code train.feature.ngram.extractionFields}, {@code train.feature.ngram.slop}.
 *
 * @param config source of the configuration values listed above
 * @param index  index passed through to {@code NgramEnumerator.gatherNgram}
 * @param ids    ids passed through to the enumerator; the array length scales
 *               {@code minDf} into an absolute threshold (presumably these are
 *               the training document ids; confirm against the caller)
 * @param logger receives progress messages
 * @return the element-set view of the accumulated multiset
 * @throws Exception declared by the signature; not thrown explicitly by this
 *                   method's own statements
 */
static Set<Ngram> gather(Config config, ESIndex index, String[] ids, Logger logger) throws Exception {
    // The folder is created here although nothing below writes into it;
    // the result of mkdirs() is not checked.
    File metaDataFolder = new File(config.getString("output.folder"), "meta_data");
    metaDataFolder.mkdirs();

    Multiset<Ngram> accumulated = ConcurrentHashMultiset.create();

    List<Integer> ngramLengths = config.getIntegers("train.feature.ngram.n");
    double minDf = config.getDouble("train.feature.ngram.minDf");
    // Relative document frequency converted to an absolute count, rounded down.
    int frequencyThreshold = (int) Math.floor(ids.length * minDf);
    List<String> extractionFields = config.getStrings("train.feature.ngram.extractionFields");
    List<Integer> slopValues = config.getIntegers("train.feature.ngram.slop");

    for (String field : extractionFields) {
        for (int n : ngramLengths) {
            for (int slop : slopValues) {
                String banner = "gathering " + n + "-grams from field " + field + " with slop " + slop + " and minDf " + minDf + ", (actual frequency threshold = " + frequencyThreshold + ")";
                logger.info(banner);

                Multiset<Ngram> found = NgramEnumerator.gatherNgram(
                        index, ids, new NgramTemplate(field, n, slop), frequencyThreshold);
                logger.info("gathered " + found.elementSet().size() + " ngrams");

                int accepted = 0;
                for (Multiset.Entry<Ngram> candidate : found.entrySet()) {
                    Ngram gram = candidate.getElement();
                    int occurrences = candidate.getCount();
                    if (!interesting(accumulated, gram, occurrences)) {
                        continue;
                    }
                    accumulated.add(gram, occurrences);
                    accepted++;
                }
                logger.info(accepted + " are really new");
            }
        }
    }

    logger.info("there are " + accumulated.elementSet().size() + " ngrams in total");
    // Serialization of the result into metaDataFolder is currently disabled:
    // Serialization.serialize(uniques, new File(metaDataFolder, "all_ngrams.ser"));
    return accumulated.elementSet();
}
Aggregations