Search in sources :

Example 1 with CompressedCharNgramModel

use of zemberek.langid.model.CompressedCharNgramModel in project zemberek-nlp by ahmetaa.

the class LanguageIdentifier method fromInternalModels.

/**
 * Loads the compressed character n-gram model of every language returned by
 * {@code Language.allLanguages()} from the internal resource folder
 * {@code /models/compressed/}. Each model is read from a resource named
 * {@code <language>.clm}.
 *
 * @return a LanguageIdentifier backed by one model per language.
 * @throws IOException In case of an IO error while opening or reading a model resource.
 */
public static LanguageIdentifier fromInternalModels() throws IOException {
    // A set is used so that a language id listed more than once is loaded only once.
    Set<String> langs = Sets.newHashSet(Language.allLanguages());
    Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
    for (String language : langs) {
        String resourceName = "/models/compressed/" + language + ".clm";
        // try-with-resources guarantees the stream is released even when loading fails.
        // No null check is needed: URL.openStream() never returns null, and a missing
        // resource already fails at the lookup/open step before a stream exists.
        try (InputStream is =
            Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
            map.put(language, CompressedCharNgramModel.load(is));
        }
    }
    return new LanguageIdentifier(map);
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel) InputStream(java.io.InputStream) CompressedCharNgramModel(zemberek.langid.model.CompressedCharNgramModel)

Example 2 with CompressedCharNgramModel

use of zemberek.langid.model.CompressedCharNgramModel in project zemberek-nlp by ahmetaa.

the class LanguageIdentifier method fromInternalModelGroup.

/**
 * Loads a group of compressed models from an internal resource folder. For example, if
 * {@code /models/tr_group} exists, {@code fromInternalModelGroup("tr_group")} loads it.
 *
 * <p>The folder {@code /models/<groupId>/} must contain a {@code langs.txt} file holding a
 * comma separated list of language ids, and one {@code <language>.clm} model file for each
 * listed id.
 *
 * @param groupId internal folder name
 * @return a LanguageIdentifier backed by one model per listed language.
 * @throws IOException In case of an IO error while opening or reading a resource.
 * @throws IllegalArgumentException if {@code langs.txt} lists no language.
 */
public static LanguageIdentifier fromInternalModelGroup(String groupId) throws IOException {
    String groupRoot = "/models/" + groupId + "/";

    // Read the comma separated language ids; blank entries are dropped and ids are trimmed.
    Set<String> languages = Sets.newHashSet();
    String languageList = groupRoot + "langs.txt";
    try (InputStream is =
        Resources.getResource(LanguageIdentifier.class, languageList).openStream()) {
        String langLine = SimpleTextReader.trimmingReader(is, "utf-8").asString().trim();
        Iterables.addAll(
            languages, Splitter.on(",").omitEmptyStrings().trimResults().split(langLine));
    }
    if (languages.isEmpty()) {
        throw new IllegalArgumentException("No language is provided!");
    }

    Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
    for (String language : languages) {
        String resourceName = groupRoot + language + ".clm";
        // try-with-resources guarantees the stream is released even when loading fails.
        // No null check is needed: URL.openStream() never returns null, and a missing
        // resource already fails at the lookup/open step before a stream exists.
        try (InputStream is =
            Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
            map.put(language, CompressedCharNgramModel.load(is));
        }
    }
    return new LanguageIdentifier(map);
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel) InputStream(java.io.InputStream) CompressedCharNgramModel(zemberek.langid.model.CompressedCharNgramModel)

Aggregations

InputStream (java.io.InputStream)2 CharNgramLanguageModel (zemberek.langid.model.CharNgramLanguageModel)2 CompressedCharNgramModel (zemberek.langid.model.CompressedCharNgramModel)2 MapBasedCharNgramLanguageModel (zemberek.langid.model.MapBasedCharNgramLanguageModel)2