use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
the class LanguageIdentifier method fromInternalModels.
/**
 * Loads one compressed model for every language id returned by {@code Language.allLanguages()}
 * from the internal resource folder {@code /models/compressed/}. The model of a language is read
 * from the resource {@code /models/compressed/<language>.clm}.
 *
 * @return LanguageIdentifier built from a map of language id to loaded model.
 * @throws IOException In case of an IO error while opening or reading a model resource.
 */
public static LanguageIdentifier fromInternalModels() throws IOException {
  // Copying the ids into a set drops duplicates, so each model is loaded only once.
  Set<String> langs = Sets.newHashSet(Language.allLanguages());
  Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
  for (String language : langs) {
    String resourceName = "/models/compressed/" + language + ".clm";
    // The stream is closed on every path, including when loading fails half way.
    // NOTE(review): a missing resource is presumably reported by Resources.getResource itself
    // (Guava throws IllegalArgumentException) - confirm which Resources class is imported.
    // A null check on the opened stream can never fire, because openStream() has already been
    // invoked on the lookup result by then.
    try (InputStream is =
        Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
      CompressedCharNgramModel model = CompressedCharNgramModel.load(is);
      map.put(language, model);
    }
  }
  return new LanguageIdentifier(map);
}
use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
the class LanguageIdentifier method fromInternalModelGroup.
/**
 * Loads a group of internal compressed models from the resource folder
 * {@code /models/<groupId>/}. The folder must contain a {@code langs.txt} file holding a comma
 * separated list of language ids, and one {@code <language>.clm} model resource per listed id.
 * For example, to load the models stored under a folder named tr_group,
 * fromInternalModelGroup("tr_group") should be called.
 *
 * @param groupId internal folder name
 * @return LanguageIdentifier built from a map of language id to loaded model.
 * @throws IOException In case of an IO error while opening or reading a resource.
 * @throws IllegalArgumentException if langs.txt lists no language.
 */
public static LanguageIdentifier fromInternalModelGroup(String groupId) throws IOException {
  Set<String> languages = Sets.newHashSet();
  String languageList = "/models/" + groupId + "/langs.txt";
  try (InputStream is =
      Resources.getResource(LanguageIdentifier.class, languageList).openStream()) {
    String langLine = SimpleTextReader.trimmingReader(is, "utf-8").asString().trim();
    // Blank entries (e.g. from a trailing comma) are skipped and ids are trimmed.
    for (String langStr : Splitter.on(",").omitEmptyStrings().trimResults().split(langLine)) {
      languages.add(langStr);
    }
  }
  if (languages.isEmpty()) {
    throw new IllegalArgumentException("No language is provided!");
  }
  Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
  for (String language : languages) {
    String resourceName = "/models/" + groupId + "/" + language + ".clm";
    // The stream is closed on every path, same as the langs.txt stream above.
    // NOTE(review): a missing resource is presumably reported by Resources.getResource itself
    // (Guava throws IllegalArgumentException) - confirm which Resources class is imported.
    // A null check on the opened stream can never fire, because openStream() has already been
    // invoked on the lookup result by then.
    try (InputStream is =
        Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
      CompressedCharNgramModel model = CompressedCharNgramModel.load(is);
      map.put(language, model);
    }
  }
  return new LanguageIdentifier(map);
}
use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
the class LanguageIdentifier method scoreWithElimination.
/**
 * Scores the input against the available models while periodically discarding the tail of the
 * candidate list.
 *
 * <p>Grams are taken from the input at the sampling points produced by {@code getStepping}. Each
 * gram's probability is added to the score of every remaining candidate. Whenever
 * ELIMINATION_SAMPLE_STEP grams have been processed since the last elimination and more than two
 * candidates remain, the candidates are sorted and only the first {@code size / 2 + 1} of them
 * are kept.
 *
 * @param input text to score.
 * @param maxSampleCount sample count passed to {@code getStepping}; when zero or negative the
 *     input length is used instead.
 * @return the remaining candidates sorted by the natural ordering of ModelScore (presumably best
 *     score first, since the front of the sorted list is what survives - confirm against
 *     ModelScore.compareTo).
 */
private List<ModelScore> scoreWithElimination(String input, int maxSampleCount) {
  int sampleCount = maxSampleCount <= 0 ? input.length() : maxSampleCount;
  int[] samplingPoints = getStepping(input, sampleCount);

  // Every model starts as a candidate with a zero score.
  List<ModelScore> candidates = Lists.newArrayListWithCapacity(modelIdArray.length);
  for (CharNgramLanguageModel languageModel : models.values()) {
    candidates.add(new ModelScore(languageModel, 0));
  }

  int gramsSinceElimination = 0;
  for (String gram : getGrams(input, samplingPoints)) {
    boolean eliminationDue = gramsSinceElimination == ELIMINATION_SAMPLE_STEP;
    if (eliminationDue && candidates.size() > 2) {
      gramsSinceElimination = 0;
      Collections.sort(candidates);
      // Keep the front half (plus one) of the sorted candidates.
      int keepCount = candidates.size() / 2 + 1;
      candidates = candidates.subList(0, keepCount);
    }
    for (ModelScore candidate : candidates) {
      candidate.score += candidate.model.gramProbability(gram);
    }
    gramsSinceElimination++;
  }

  Collections.sort(candidates);
  return candidates;
}
use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.
the class Trainer method generateModelsToDir.
/**
 * Builds language models from the count files and writes one model file per language into the
 * model directory.
 *
 * @param countDir directory handed to {@code LanguageIdentifier.generateFromCounts}.
 * @param modelDir output directory; passed to {@code mkDir} before any model is written.
 * @param languages language ids handed to {@code LanguageIdentifier.generateFromCounts}.
 * @param compressed if true each model is written as {@code <id>.clm} through
 *     {@code CompressedCharNgramModel.compress}, otherwise as {@code <id>.lm} through
 *     {@code saveCustom}.
 * @throws IOException In case of an IO error.
 */
private void generateModelsToDir(File countDir, File modelDir, String[] languages, boolean compressed) throws IOException {
  List<CharNgramLanguageModel> generated =
      LanguageIdentifier.generateFromCounts(countDir, languages).getModels();
  mkDir(modelDir);
  for (CharNgramLanguageModel generatedModel : generated) {
    System.out.println("Generating model for:" + generatedModel.getId());
    // Both output paths need the map based implementation, so cast once up front.
    MapBasedCharNgramLanguageModel mapBased = (MapBasedCharNgramLanguageModel) generatedModel;
    String extension = compressed ? ".clm" : ".lm";
    File target = new File(modelDir, generatedModel.getId() + extension);
    if (compressed) {
      CompressedCharNgramModel.compress(mapBased, target);
    } else {
      mapBased.saveCustom(target);
    }
  }
}
Aggregations