Use of zemberek.langid.model.CharNgramCountModel in project zemberek-nlp by ahmetaa.
Class LanguageIdentifier, method generateFromCounts.
/**
 * Builds a {@link LanguageIdentifier} from the count model files found in a directory.
 *
 * <p>The part of each file name before its first '.' is used as the lookup key for that file
 * (the whole name is used when it contains no '.'). For every requested language a model is
 * trained from the matching count file; a warning is logged for languages without a file.
 * All count files that were not claimed by a requested language are merged into a single
 * count model with id "unk", which is trained and registered under the id the trained model reports.
 *
 * @param countModelsDir directory that contains the count model files.
 * @param languages language ids to build models for; they are lower-cased before lookup.
 * @return an identifier backed by the generated models.
 * @throws IllegalArgumentException if the directory cannot be listed or contains no files.
 * @throws IOException declared for the count-model loading calls.
 */
public static LanguageIdentifier generateFromCounts(File countModelsDir, String[] languages) throws IOException {
  Map<String, File> modelFileMap = Maps.newHashMap();
  Map<String, CharNgramLanguageModel> modelMap = Maps.newHashMap();
  File[] allFiles = countModelsDir.listFiles();
  // Default n-gram order; replaced by the order of the most recently loaded count model.
  int order = 3;
  if (allFiles == null || allFiles.length == 0) {
    throw new IllegalArgumentException("There is no file in:" + countModelsDir);
  }
  for (File file : allFiles) {
    final String fileName = file.getName();
    final int dotIndex = fileName.indexOf('.');
    // A name without '.' would make substring(0, -1) throw; use the whole name as the key instead.
    final String langStr = dotIndex < 0 ? fileName : fileName.substring(0, dotIndex);
    modelFileMap.put(langStr, file);
  }
  // generate models for required models on the fly.
  Log.info("Generating models for:" + Arrays.toString(languages));
  for (String language : languages) {
    // Locale.ROOT keeps the lookup key stable: with a Turkish default locale "I" would
    // lower-case to dotless 'ı' and the key would no longer match the file name.
    String l = language.toLowerCase(java.util.Locale.ROOT);
    if (modelFileMap.containsKey(l)) {
      CharNgramCountModel countModel = CharNgramCountModel.load(modelFileMap.get(l));
      order = countModel.order;
      MapBasedCharNgramLanguageModel lm = MapBasedCharNgramLanguageModel.train(countModel);
      modelMap.put(l, lm);
      // Claimed files are removed so that only unrequested ones feed the garbage model below.
      modelFileMap.remove(l);
    } else {
      Log.warn("Cannot find count model file for language " + language);
    }
  }
  // generate garbage model from the remaining files if any left.
  if (!modelFileMap.isEmpty()) {
    Log.info("Generating garbage model from remaining count models.");
    CharNgramCountModel garbageModel = new CharNgramCountModel("unk", order);
    for (File file : modelFileMap.values()) {
      garbageModel.merge(CharNgramCountModel.load(file));
    }
    MapBasedCharNgramLanguageModel lm = MapBasedCharNgramLanguageModel.train(garbageModel);
    modelMap.put(lm.getId(), lm);
  }
  return new LanguageIdentifier(modelMap);
}
Use of zemberek.langid.model.CharNgramCountModel in project zemberek-nlp by ahmetaa.
Class ModelGenerator, method generateModel.
/**
 * Trains a language model from the given training description.
 *
 * <p>Prints the model id and training files to standard output, builds the count model via
 * {@code getCountModel} and hands it to {@code MapBasedCharNgramLanguageModel.train}.
 *
 * @param modelData training description (model id and training files are read here).
 * @return the trained model.
 * @throws IOException declared for the count-model generation step.
 */
public MapBasedCharNgramLanguageModel generateModel(ModelTrainData modelData) throws IOException {
  final String progressMessage =
      "Training for:" + modelData.modelId + " Training files: " + modelData.modelFiles;
  System.out.println(progressMessage);
  return MapBasedCharNgramLanguageModel.train(getCountModel(modelData));
}
Use of zemberek.langid.model.CharNgramCountModel in project zemberek-nlp by ahmetaa.
Class ModelGenerator, method getCountModel.
/**
 * Builds a character n-gram count model from the training files of the given description.
 *
 * <p>Every file is read as UTF-8 and its lines are de-duplicated. Each distinct line is
 * lower-cased; lines containing any of the {@code ignoreWords} are skipped and counted, all
 * other lines are preprocessed and added to the count model. After all files are processed
 * the configured cut-offs are applied.
 *
 * @param modelTrainData training description (model id, order, files and cut-offs are read).
 * @return the populated count model.
 * @throws IOException if a training file cannot be read.
 */
public CharNgramCountModel getCountModel(ModelTrainData modelTrainData) throws IOException {
  CharNgramCountModel countModel = new CharNgramCountModel(modelTrainData.modelId, modelTrainData.order);
  for (File trainingFile : modelTrainData.modelFiles) {
    System.out.println("Processing file:" + file(trainingFile));
    // Identical lines within one file contribute only once.
    Set<String> uniqueLines =
        new HashSet<>(com.google.common.io.Files.readLines(trainingFile, Charsets.UTF_8));
    int skipped = 0;
    nextLine:
    for (String rawLine : uniqueLines) {
      // NOTE(review): toLowerCase() without a Locale uses the JVM default locale - confirm intended.
      String lowered = rawLine.toLowerCase();
      for (String ignoreWord : ignoreWords) {
        if (lowered.contains(ignoreWord)) {
          skipped++;
          continue nextLine;
        }
      }
      countModel.addGrams(LanguageIdentifier.preprocess(lowered));
    }
    System.out.println("Ignored lines for " + trainingFile + " : " + skipped);
  }
  countModel.applyCutOffs(modelTrainData.cutOffs);
  // Presumably a diagnostic dump of the order-1 grams; verify against CharNgramCountModel.
  countModel.dumpGrams(1);
  return countModel;
}

// Identity helper so the progress message concatenates the File exactly as before.
private static File file(File f) {
  return f;
}
Use of zemberek.langid.model.CharNgramCountModel in project zemberek-nlp by ahmetaa.
Class Trainer, method train.
/**
 * Generates the count model for one training description and saves it as
 * {@code <modelId>.count} inside {@code countModelDir}.
 *
 * @param td training description; its model id names the output file.
 * @throws IOException declared for the count-model generation and save steps.
 */
private void train(ModelGenerator.ModelTrainData td) throws IOException {
  final String countFileName = td.modelId + ".count";
  final File target = new File(countModelDir, countFileName);
  modelGenerator.getCountModel(td).save(target);
}
Use of zemberek.langid.model.CharNgramCountModel in project zemberek-nlp by ahmetaa.
Class ModelGenerator, method generateCountModelToDirectory.
/**
 * Generates a count model for every training description in the list and saves each one as
 * {@code <modelId>.count} inside the given directory.
 *
 * @param outDir directory the count files are written to.
 * @param modelTrainDataList training descriptions, processed in list order.
 * @throws IOException declared for the count-model generation and save steps.
 */
public void generateCountModelToDirectory(File outDir, List<ModelTrainData> modelTrainDataList) throws IOException {
  for (ModelTrainData trainData : modelTrainDataList) {
    // The model is built before the target file object is created, as in the original order.
    CharNgramCountModel counts = getCountModel(trainData);
    counts.save(new File(outDir, trainData.modelId + ".count"));
  }
}
Aggregations