Search in sources :

Example 6 with CharNgramLanguageModel

use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.

the class LanguageIdentifier method fromInternalModels.

/**
 * Loads one compressed model per language returned by {@code Language.allLanguages()} from the
 * internal resource path {@code /models/compressed/<language>.clm}.
 *
 * <p>Every model stream is opened in a try-with-resources block, so it is released both after a
 * successful load and when loading fails part-way through the language list.
 *
 * @return LanguageIdentifier built from the loaded models, keyed by language id
 * @throws IOException In case of an IO error.
 * @throws IllegalArgumentException if no stream could be obtained for a model resource
 */
public static LanguageIdentifier fromInternalModels() throws IOException {
    Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
    // Passing the ids through a set drops duplicates before any resource is opened.
    Set<String> langs = Sets.newHashSet(Language.allLanguages());
    for (String language : langs) {
        String resourceName = "/models/compressed/" + language + ".clm";
        // The stream used to stay open after load(); close it deterministically here.
        try (InputStream is = Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
            // NOTE(review): openStream() is not expected to yield null, so this guard is
            // presumably defensive only; kept to preserve the original error contract.
            if (is == null) {
                throw new IllegalArgumentException("No internal model found: " + resourceName);
            }
            CompressedCharNgramModel model = CompressedCharNgramModel.load(is);
            map.put(language, model);
        }
    }
    return new LanguageIdentifier(map);
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel) InputStream(java.io.InputStream) CompressedCharNgramModel(zemberek.langid.model.CompressedCharNgramModel)

Example 7 with CharNgramLanguageModel

use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.

the class LanguageIdentifier method fromInternalModelGroup.

/**
 * Loads a group of compressed models from the internal resource folder
 * {@code /models/<groupId>/}. The folder must contain a {@code langs.txt} file holding a
 * comma-separated list of language ids, plus one {@code <language>.clm} model per listed id.
 * For example, to load the models of a folder named tr_group,
 * fromInternalModelGroup("tr_group") should be called.
 *
 * <p>Both the language list and every model stream are opened in try-with-resources blocks, so
 * they are released even when loading fails part-way through.
 *
 * @param groupId internal folder name
 * @return LanguageIdentifier built from the loaded models, keyed by language id
 * @throws IOException In case of an IO error.
 * @throws IllegalArgumentException if the language list is empty or no stream could be obtained
 *     for a model resource
 */
public static LanguageIdentifier fromInternalModelGroup(String groupId) throws IOException {
    Set<String> languages = Sets.newHashSet();
    String languageList = "/models/" + groupId + "/langs.txt";
    try (InputStream is = Resources.getResource(LanguageIdentifier.class, languageList).openStream()) {
        String langLine = SimpleTextReader.trimmingReader(is, "utf-8").asString().trim();
        // Blank entries (e.g. from a trailing comma) are skipped and ids are trimmed.
        for (String langStr : Splitter.on(",").omitEmptyStrings().trimResults().split(langLine)) {
            languages.add(langStr);
        }
    }
    if (languages.isEmpty()) {
        throw new IllegalArgumentException("No language is provided!");
    }
    Map<String, CharNgramLanguageModel> map = Maps.newHashMap();
    for (String language : languages) {
        String resourceName = "/models/" + groupId + "/" + language + ".clm";
        // The model stream used to stay open after load(); close it like the list stream above.
        try (InputStream is = Resources.getResource(LanguageIdentifier.class, resourceName).openStream()) {
            // NOTE(review): openStream() is not expected to yield null, so this guard is
            // presumably defensive only; kept to preserve the original error contract.
            if (is == null) {
                throw new IllegalArgumentException("No internal model found: " + resourceName);
            }
            CompressedCharNgramModel model = CompressedCharNgramModel.load(is);
            map.put(language, model);
        }
    }
    return new LanguageIdentifier(map);
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel) InputStream(java.io.InputStream) CompressedCharNgramModel(zemberek.langid.model.CompressedCharNgramModel)

Example 8 with CharNgramLanguageModel

use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.

the class LanguageIdentifier method scoreWithElimination.

/**
 * Scores the input against the available models, periodically discarding the lower part of the
 * sorted candidate list so that later grams are evaluated against fewer models.
 *
 * @param input text to score
 * @param maxSampleCount sample count passed to getStepping; when it is zero or negative the
 *     input length is passed instead
 * @return the remaining candidates, sorted by their natural ordering
 */
private List<ModelScore> scoreWithElimination(String input, int maxSampleCount) {
    int sampleCount = maxSampleCount <= 0 ? input.length() : maxSampleCount;
    int[] samplingPoints = getStepping(input, sampleCount);

    // Every model starts as a candidate with a zero score.
    List<ModelScore> candidates = Lists.newArrayListWithCapacity(modelIdArray.length);
    for (CharNgramLanguageModel languageModel : models.values()) {
        candidates.add(new ModelScore(languageModel, 0));
    }

    int gramsSinceLastCut = 0;
    for (String gram : getGrams(input, samplingPoints)) {
        boolean cutIsDue = gramsSinceLastCut == ELIMINATION_SAMPLE_STEP;
        if (cutIsDue && candidates.size() > 2) {
            // Keep the first half (plus one) of the sorted candidates and restart the interval.
            Collections.sort(candidates);
            candidates = candidates.subList(0, candidates.size() / 2 + 1);
            gramsSinceLastCut = 0;
        }
        for (ModelScore candidate : candidates) {
            candidate.score += candidate.model.gramProbability(gram);
        }
        gramsSinceLastCut++;
    }

    Collections.sort(candidates);
    return candidates;
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel)

Example 9 with CharNgramLanguageModel

use of zemberek.langid.model.CharNgramLanguageModel in project zemberek-nlp by ahmetaa.

the class Trainer method generateModelsToDir.

/**
 * Builds a LanguageIdentifier from the counts in {@code countDir} and writes each of its models
 * into {@code modelDir}: as {@code <id>.clm} via CompressedCharNgramModel.compress when
 * {@code compressed} is true, otherwise as {@code <id>.lm} via saveCustom.
 *
 * @param countDir directory handed to LanguageIdentifier.generateFromCounts
 * @param modelDir output directory, prepared with mkDir before any model is written
 * @param languages language ids handed to LanguageIdentifier.generateFromCounts
 * @param compressed selects the compressed (.clm) or custom (.lm) output format
 * @throws IOException In case of an IO error.
 */
private void generateModelsToDir(File countDir, File modelDir, String[] languages, boolean compressed) throws IOException {
    LanguageIdentifier langIdentifier = LanguageIdentifier.generateFromCounts(countDir, languages);
    List<CharNgramLanguageModel> generated = langIdentifier.getModels();
    mkDir(modelDir);
    // The output format is fixed for the whole run, so pick the file extension once.
    final String extension = compressed ? ".clm" : ".lm";
    for (CharNgramLanguageModel current : generated) {
        System.out.println("Generating model for:" + current.getId());
        // Both output paths need the map-based implementation.
        MapBasedCharNgramLanguageModel mapBased = (MapBasedCharNgramLanguageModel) current;
        File target = new File(modelDir, current.getId() + extension);
        if (!compressed) {
            mapBased.saveCustom(target);
        } else {
            CompressedCharNgramModel.compress(mapBased, target);
        }
    }
}
Also used : CharNgramLanguageModel(zemberek.langid.model.CharNgramLanguageModel) MapBasedCharNgramLanguageModel(zemberek.langid.model.MapBasedCharNgramLanguageModel) LanguageIdentifier(zemberek.langid.LanguageIdentifier) File(java.io.File)

Aggregations

CharNgramLanguageModel (zemberek.langid.model.CharNgramLanguageModel): 9, MapBasedCharNgramLanguageModel (zemberek.langid.model.MapBasedCharNgramLanguageModel): 9, File (java.io.File): 3, InputStream (java.io.InputStream): 2, CompressedCharNgramModel (zemberek.langid.model.CompressedCharNgramModel): 2, LanguageIdentifier (zemberek.langid.LanguageIdentifier): 1, CharNgramCountModel (zemberek.langid.model.CharNgramCountModel): 1