Use of zemberek.langid.model.MapBasedCharNgramLanguageModel in the zemberek-nlp project by ahmetaa.
From the class LanguageIdentifier, method generateFromCounts.
/**
 * Builds a {@link LanguageIdentifier} by training language models on the fly from the
 * character n-gram count model files found in a directory.
 *
 * <p>The part of each file name before its first '.' (or the whole name when it has no '.')
 * is used, as-is, as that file's language key. For every requested language whose lower-cased
 * id matches a key, the count model is loaded and a {@link MapBasedCharNgramLanguageModel} is
 * trained from it. All count files that were not requested are merged into one count model
 * created with the id "unk", and one additional model is trained from that merge.
 *
 * @param countModelsDir directory whose entries are count model files.
 * @param languages ids of the languages to build dedicated models for; each id is lower-cased
 *     (locale-independently) before it is matched against the file name keys. Ids without a
 *     matching file are skipped with a warning.
 * @return an identifier constructed from the trained models.
 * @throws IllegalArgumentException if the directory cannot be listed or contains no entries.
 * @throws IOException declared for the count model loading performed by this method.
 */
public static LanguageIdentifier generateFromCounts(File countModelsDir, String[] languages) throws IOException {
  File[] allFiles = countModelsDir.listFiles();
  if (allFiles == null || allFiles.length == 0) {
    throw new IllegalArgumentException("There is no file in:" + countModelsDir);
  }

  Map<String, File> modelFileMap = Maps.newHashMap();
  for (File file : allFiles) {
    String fileName = file.getName();
    int dotIndex = fileName.indexOf('.');
    // indexOf returns -1 for names without an extension; substring(0, -1) would throw,
    // so fall back to the full name in that case.
    String langStr = dotIndex < 0 ? fileName : fileName.substring(0, dotIndex);
    modelFileMap.put(langStr, file);
  }

  Map<String, CharNgramLanguageModel> modelMap = Maps.newHashMap();
  // Default order for the garbage model; overwritten by the order of the most recently
  // loaded requested count model.
  int order = 3;

  // generate models for required models on the fly.
  Log.info("Generating models for:" + Arrays.toString(languages));
  for (String language : languages) {
    // Locale.ROOT keeps the mapping stable regardless of the JVM default locale
    // (a Turkish default locale would lower-case 'I' to a dotless 'ı').
    String l = language.toLowerCase(java.util.Locale.ROOT);
    // Removing the entry here leaves only the non-requested files for the garbage model.
    File countFile = modelFileMap.remove(l);
    if (countFile == null) {
      Log.warn("Cannot find count model file for language " + language);
      continue;
    }
    CharNgramCountModel countModel = CharNgramCountModel.load(countFile);
    order = countModel.order;
    modelMap.put(l, MapBasedCharNgramLanguageModel.train(countModel));
  }

  // generate garbage model from the remaining files if any left.
  if (!modelFileMap.isEmpty()) {
    Log.info("Generating garbage model from remaining count models.");
    CharNgramCountModel garbageModel = new CharNgramCountModel("unk", order);
    for (File file : modelFileMap.values()) {
      garbageModel.merge(CharNgramCountModel.load(file));
    }
    MapBasedCharNgramLanguageModel lm = MapBasedCharNgramLanguageModel.train(garbageModel);
    // Registered under whatever id the trained model reports.
    modelMap.put(lm.getId(), lm);
  }
  return new LanguageIdentifier(modelMap);
}
Use of zemberek.langid.model.MapBasedCharNgramLanguageModel in the zemberek-nlp project by ahmetaa.
From the class Trainer, method generateModelsToDir.
/**
 * Trains language models from the count models in {@code countDir} and writes one model file
 * per trained model into {@code modelDir}.
 *
 * <p>Each model is written as {@code <id>.clm} through
 * {@code CompressedCharNgramModel.compress} when {@code compressed} is true, and as
 * {@code <id>.lm} through {@code saveCustom} otherwise. Every model returned by the identifier
 * is cast to {@link MapBasedCharNgramLanguageModel}.
 *
 * @param countDir directory handed to {@code LanguageIdentifier.generateFromCounts}.
 * @param modelDir output directory; passed to {@code mkDir} before any model is written
 *     (presumably to create it if missing; confirm against {@code mkDir}).
 * @param languages language ids handed to {@code LanguageIdentifier.generateFromCounts}.
 * @param compressed selects the compressed ({@code .clm}) or plain ({@code .lm}) output format.
 * @throws IOException declared for the model generation and writing steps.
 */
private void generateModelsToDir(File countDir, File modelDir, String[] languages, boolean compressed) throws IOException {
  List<CharNgramLanguageModel> trained =
      LanguageIdentifier.generateFromCounts(countDir, languages).getModels();
  mkDir(modelDir);

  // The output format is fixed for the whole run, so pick the extension once.
  final String extension = compressed ? ".clm" : ".lm";

  for (CharNgramLanguageModel current : trained) {
    final String id = current.getId();
    System.out.println("Generating model for:" + id);
    MapBasedCharNgramLanguageModel mapBased = (MapBasedCharNgramLanguageModel) current;
    File target = new File(modelDir, id + extension);
    if (!compressed) {
      mapBased.saveCustom(target);
      continue;
    }
    CompressedCharNgramModel.compress(mapBased, target);
  }
}
Aggregations