Use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class LanguageIdentifier, method scoreFull.
/**
 * Scores the input against every loaded model and returns the sorted results.
 *
 * <p>Each model starts at a score of 0 and accumulates the value returned by
 * {@code gramProbability} for every gram extracted from the input. The resulting list is
 * ordered with {@code Collections.sort}, i.e. by the natural ordering of {@code ModelScore}.
 *
 * @param input text to score.
 * @param maxSampleCount value handed to {@code getStepping}; when it is zero or negative,
 *     {@code input.length()} is used instead.
 * @return one {@code ModelScore} per model, sorted.
 */
private List<ModelScore> scoreFull(String input, int maxSampleCount) {
    // A non-positive limit falls back to the input length.
    final int sampleCount = maxSampleCount <= 0 ? input.length() : maxSampleCount;
    final int[] samplingPoints = getStepping(input, sampleCount);

    final List<ModelScore> results = Lists.newArrayListWithCapacity(modelIdArray.length);
    for (CharNgramLanguageModel languageModel : models.values()) {
        results.add(new ModelScore(languageModel, 0));
    }

    // Outer loop over grams, inner loop over models: every model sees the grams in order.
    for (String gram : getGrams(input, samplingPoints)) {
        for (ModelScore candidate : results) {
            candidate.score += candidate.model.gramProbability(gram);
        }
    }

    Collections.sort(results);
    return results;
}
Use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class LanguageIdentifier, method getModelsFromDir.
/**
 * Loads one language model for each file found directly inside {@code dir}.
 *
 * <p>The map key for a file is its name up to (but not including) the first '.'. A name that
 * contains no '.' is used as the key unchanged. Sub-directories are skipped, since they
 * cannot be model files.
 *
 * @param dir directory that holds the model files.
 * @param compressed when {@code true} files are read with
 *     {@code CompressedCharNgramModel.load}, otherwise with
 *     {@code MapBasedCharNgramLanguageModel.loadCustom}.
 * @return a non-empty map from key to loaded model.
 * @throws IllegalArgumentException if {@code dir} does not exist, is not a directory, is
 *     empty, or contains no loadable model file.
 * @throws IOException declared for the model loaders (presumably raised when a file cannot
 *     be read; confirm against the loader implementations).
 */
private static Map<String, CharNgramLanguageModel> getModelsFromDir(File dir, boolean compressed) throws IOException {
    if (!dir.exists()) {
        throw new IllegalArgumentException("Training data directory does not exist:" + dir);
    }
    if (!dir.isDirectory()) {
        throw new IllegalArgumentException(dir + " is not a directory");
    }
    File[] allFiles = dir.listFiles();
    if (allFiles == null || allFiles.length == 0) {
        throw new IllegalArgumentException("There is no file in:" + dir);
    }
    Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
    for (File file : allFiles) {
        if (file.isDirectory()) {
            // A nested directory is not a model file; do not hand it to a loader.
            continue;
        }
        final String fileName = file.getName();
        final int dotIndex = fileName.indexOf('.');
        // indexOf returns -1 when there is no extension; substring(0, -1) would throw.
        final String langStr = dotIndex < 0 ? fileName : fileName.substring(0, dotIndex);
        if (compressed) {
            map.put(langStr, CompressedCharNgramModel.load(file));
        } else {
            map.put(langStr, MapBasedCharNgramLanguageModel.loadCustom(file));
        }
    }
    // Reachable when the directory contained only sub-directories.
    if (map.isEmpty()) {
        throw new IllegalArgumentException("There is no model file in dir:" + dir);
    }
    return map;
}
Use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class LanguageIdentifier, method generateFromCounts.
/**
 * Builds a {@code LanguageIdentifier} by training models from count-model files.
 *
 * <p>Every file directly inside {@code countModelsDir} is keyed by its name up to the first
 * '.' (the whole name when there is no '.'); sub-directories are skipped. For each requested
 * language whose lower-cased id matches a key, the count model is loaded and trained into a
 * language model. All count-model files that were not requested are merged into a single
 * count model with id "unk", which is trained as well and registered under the id the
 * trained model reports.
 *
 * @param countModelsDir directory that contains the count-model files.
 * @param languages ids of the languages that get their own model; matched
 *     case-insensitively (lower-cased with {@code Locale.ROOT}) against the file keys.
 * @return an identifier backed by the trained models.
 * @throws IllegalArgumentException if the directory cannot be listed or is empty.
 * @throws IOException declared for {@code CharNgramCountModel.load} (presumably raised when
 *     a file cannot be read; confirm against the loader implementation).
 */
public static LanguageIdentifier generateFromCounts(File countModelsDir, String[] languages) throws IOException {
    File[] allFiles = countModelsDir.listFiles();
    if (allFiles == null || allFiles.length == 0) {
        throw new IllegalArgumentException("There is no file in:" + countModelsDir);
    }
    Map<String, File> modelFileMap = Maps.newHashMap();
    for (File file : allFiles) {
        if (file.isDirectory()) {
            // A nested directory is not a count-model file; loading it would fail.
            continue;
        }
        final String fileName = file.getName();
        final int dotIndex = fileName.indexOf('.');
        // indexOf returns -1 when there is no extension; substring(0, -1) would throw.
        final String langStr = dotIndex < 0 ? fileName : fileName.substring(0, dotIndex);
        modelFileMap.put(langStr, file);
    }
    // generate models for required models on the fly.
    Log.info("Generating models for:" + Arrays.toString(languages));
    Map<String, CharNgramLanguageModel> modelMap = Maps.newHashMap();
    // Order used for the garbage model: 3 unless a requested count model is loaded, in which
    // case the order of the most recently loaded one is used.
    int order = 3;
    for (String language : languages) {
        // Locale.ROOT keeps the mapping locale-independent; with a Turkish default locale
        // "I" would otherwise lower-case to the dotless "ı" and never match a file key.
        String l = language.toLowerCase(java.util.Locale.ROOT);
        // Removing the entry here leaves only the non-requested files for the garbage model.
        File countFile = modelFileMap.remove(l);
        if (countFile != null) {
            CharNgramCountModel countModel = CharNgramCountModel.load(countFile);
            order = countModel.order;
            MapBasedCharNgramLanguageModel lm = MapBasedCharNgramLanguageModel.train(countModel);
            modelMap.put(l, lm);
        } else {
            Log.warn("Cannot find count model file for language " + language);
        }
    }
    // generate garbage model from the remaining files if any left.
    if (!modelFileMap.isEmpty()) {
        Log.info("Generating garbage model from remaining count models.");
        CharNgramCountModel garbageModel = new CharNgramCountModel("unk", order);
        for (File file : modelFileMap.values()) {
            garbageModel.merge(CharNgramCountModel.load(file));
        }
        MapBasedCharNgramLanguageModel lm = MapBasedCharNgramLanguageModel.train(garbageModel);
        modelMap.put(lm.getId(), lm);
    }
    return new LanguageIdentifier(modelMap);
}
Use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class LanguageIdentifier, method identifyConf.
/**
 * Picks the best-scoring model for the input and attaches a confidence value.
 *
 * <p>For every id in {@code modelIdArray} the model's {@code gramProbability} values are
 * summed over all grams. The per-model sums are also combined with {@code LogMath.logSum},
 * and the confidence is {@code Math.exp(bestScore - combined)}.
 * NOTE(review): the use of {@code logSum} and {@code exp} suggests the scores are
 * log-probabilities, making the confidence a normalised probability; confirm against
 * {@code gramProbability}.
 *
 * @param input text to identify.
 * @param samplingPoints positions passed on to {@code getGrams}.
 * @return the id of the highest-scoring model together with its confidence; when no score
 *     exceeds {@code -Double.MAX_VALUE} the first id is reported.
 */
private IdResult identifyConf(String input, int[] samplingPoints) {
    final String[] grams = getGrams(input, samplingPoints);
    final double[] scores = new double[models.size()];

    double combined = LogMath.LOG_ZERO;
    double bestScore = -Double.MAX_VALUE;
    int bestIndex = 0;

    for (int idx = 0; idx < modelIdArray.length; idx++) {
        final CharNgramLanguageModel languageModel = models.get(modelIdArray[idx]);

        double modelScore = 0;
        for (int g = 0; g < grams.length; g++) {
            modelScore += languageModel.gramProbability(grams[g]);
        }

        scores[idx] = modelScore;
        combined = LogMath.logSum(combined, modelScore);

        // Strict comparison: on ties the earliest model keeps the lead.
        if (modelScore > bestScore) {
            bestScore = modelScore;
            bestIndex = idx;
        }
    }

    final double confidence = Math.exp(scores[bestIndex] - combined);
    return new IdResult(modelIdArray[bestIndex], confidence);
}
Use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class LanguageIdentifier, method identifySamples.
/**
 * Returns the id of the model with the highest summed gram score for the input.
 *
 * <p>Each model's score is the sum of its {@code gramProbability} values over all grams
 * produced by {@code getGrams}.
 *
 * @param input text to identify.
 * @param samplingPoints positions passed on to {@code getGrams}.
 * @return the id of the best-scoring model, or {@code null} when there are no models or no
 *     score exceeds {@code -Double.MAX_VALUE}.
 */
private String identifySamples(String input, int[] samplingPoints) {
    final String[] grams = getGrams(input, samplingPoints);

    String bestId = null;
    double bestScore = -Double.MAX_VALUE;

    for (CharNgramLanguageModel candidate : models.values()) {
        double total = 0;
        for (int g = 0; g < grams.length; g++) {
            total += candidate.gramProbability(grams[g]);
        }
        // Strict comparison: ties keep the earlier model, and a NaN total is never selected.
        if (total > bestScore) {
            bestScore = total;
            bestId = candidate.getId();
        }
    }
    return bestId;
}
Aggregations