Use of zemberek.langid.LanguageIdentifier in the project zemberek-nlp by ahmetaa.
From the class LanguageIdServiceImpl, method detectFast.
/**
 * Answers a language-identification request using the identifier's "fast" calls.
 *
 * <p>The identifier is {@code languageIdentifierTr} when the request's {@code trGroup} flag is
 * set and {@code languageIdentifier} otherwise. The id returned by {@code identifyFast} is placed
 * in the response; when the request asks for scores, every entry returned by
 * {@code getScoresFast} is copied into the response as an {@code IdResult} message. Exactly one
 * response is emitted and the stream is then completed.
 *
 * @param request carries the input text, the max sample count, the {@code trGroup} flag and the
 *     {@code includeScores} flag
 * @param responseObserver receives the single response followed by the completion signal
 */
@Override
public void detectFast(LanguageIdRequest request, StreamObserver<LanguageIdResponse> responseObserver) {
  final LanguageIdentifier selected;
  if (request.getTrGroup()) {
    selected = languageIdentifierTr;
  } else {
    selected = languageIdentifier;
  }

  String detectedId = selected.identifyFast(request.getInput(), request.getMaxSampleCount());
  LanguageIdResponse.Builder response = LanguageIdResponse.newBuilder();
  response.setLangId(detectedId);

  if (request.getIncludeScores()) {
    // Scores are computed by a second, separate call on the same input and sample count.
    List<LanguageIdentifier.IdResult> scored =
        selected.getScoresFast(request.getInput(), request.getMaxSampleCount());
    for (LanguageIdentifier.IdResult entry : scored) {
      IdResult.Builder converted = IdResult.newBuilder();
      converted.setId(entry.id);
      converted.setScore(entry.score);
      response.addIdResult(converted.build());
    }
  }

  LanguageIdResponse message = response.build();
  responseObserver.onNext(message);
  responseObserver.onCompleted();
}
Use of zemberek.langid.LanguageIdentifier in the project zemberek-nlp by ahmetaa.
From the class WordHistogram, method removeNonTurkish.
/**
 * Reads every line of a UTF-8 text file and keeps only the lines identified as "tr".
 *
 * <p>Identification uses {@code LanguageIdentifier.fromInternalModels()} and its
 * {@code identifyFast} call; the comparison against "tr" ignores case.
 *
 * @param input file to read, decoded as UTF-8
 * @return the lines whose identified language is "tr", in file order
 * @throws IOException if the file cannot be read
 */
static List<String> removeNonTurkish(Path input) throws IOException {
  LanguageIdentifier detector = LanguageIdentifier.fromInternalModels();
  List<String> lines = Files.readAllLines(input, StandardCharsets.UTF_8);
  return lines.stream()
      .filter(line -> {
        // 200 is the second argument of identifyFast; presumably a max sample count - confirm.
        String detected = detector.identifyFast(line, 200);
        return detected.equalsIgnoreCase("tr");
      })
      .collect(Collectors.toList());
}
Use of zemberek.langid.LanguageIdentifier in the project zemberek-nlp by ahmetaa.
From the class ProperNounLanguage, method main.
/**
 * One-off script: reads candidate entries from a hard-coded vocabulary file, identifies each one
 * with the "tr_group" internal model group, and writes the entries identified as "en" to a
 * sibling file with the ".en" suffix.
 *
 * @param args unused
 * @throws IOException if the vocabulary file cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
  List<String> vocabulary =
      Files.readAllLines(Paths.get("/home/ahmetaa/projects/zemberek-nlp/zemberek.proper.vocab"));
  List<String> englishLooking = new ArrayList<>();
  LanguageIdentifier groupIdentifier = LanguageIdentifier.fromInternalModelGroup("tr_group");

  for (String entry : vocabulary) {
    String language = groupIdentifier.identify(entry);
    if (!language.equals("en")) {
      continue;
    }
    englishLooking.add(entry);
  }

  Files.write(
      Paths.get("/home/ahmetaa/projects/zemberek-nlp/zemberek.proper.vocab.en"),
      englishLooking);
}
Use of zemberek.langid.LanguageIdentifier in the project zemberek-nlp by ahmetaa.
From the class Trainer, method generateModelsToDir.
/**
 * Builds a {@code LanguageIdentifier} from count data and writes each of its models to a file in
 * {@code modelDir}.
 *
 * <p>Each model is announced on standard output, then cast to
 * {@code MapBasedCharNgramLanguageModel}; a model of any other type fails with a
 * {@code ClassCastException}. The file name is the model id plus ".clm" (written through
 * {@code CompressedCharNgramModel.compress}) when {@code compressed} is true, or the model id
 * plus ".lm" (written through {@code saveCustom}) otherwise.
 *
 * @param countDir directory handed to {@code LanguageIdentifier.generateFromCounts}
 * @param modelDir directory that receives the model files; passed to {@code mkDir} first
 * @param languages language ids handed to {@code LanguageIdentifier.generateFromCounts}
 * @param compressed selects the ".clm" compressed output over the ".lm" output
 * @throws IOException declared for the model generation and writing calls
 */
private void generateModelsToDir(File countDir, File modelDir, String[] languages, boolean compressed) throws IOException {
  LanguageIdentifier fromCounts = LanguageIdentifier.generateFromCounts(countDir, languages);
  List<CharNgramLanguageModel> generated = fromCounts.getModels();
  mkDir(modelDir);

  for (CharNgramLanguageModel current : generated) {
    System.out.println("Generating model for:" + current.getId());
    MapBasedCharNgramLanguageModel mapBased = (MapBasedCharNgramLanguageModel) current;
    if (!compressed) {
      mapBased.saveCustom(new File(modelDir, current.getId() + ".lm"));
    } else {
      CompressedCharNgramModel.compress(mapBased, new File(modelDir, current.getId() + ".clm"));
    }
  }
}
Use of zemberek.langid.LanguageIdentifier in the project zemberek-nlp by ahmetaa.
From the class LanguageIdServiceImpl, method detect.
/**
 * Answers a language-identification request using the identifier's standard calls.
 *
 * <p>The identifier is {@code languageIdentifierTr} when the request's {@code trGroup} flag is
 * set and {@code languageIdentifier} otherwise. The id returned by {@code identify} is placed in
 * the response; when the request asks for scores, every entry returned by {@code getScores} is
 * copied into the response as an {@code IdResult} message. Exactly one response is emitted and
 * the stream is then completed.
 *
 * @param request carries the input text, the max sample count, the {@code trGroup} flag and the
 *     {@code includeScores} flag
 * @param responseObserver receives the single response followed by the completion signal
 */
@Override
public void detect(LanguageIdRequest request, StreamObserver<LanguageIdResponse> responseObserver) {
  final LanguageIdentifier selected;
  if (request.getTrGroup()) {
    selected = languageIdentifierTr;
  } else {
    selected = languageIdentifier;
  }

  String detectedId = selected.identify(request.getInput(), request.getMaxSampleCount());
  LanguageIdResponse.Builder response = LanguageIdResponse.newBuilder();
  response.setLangId(detectedId);

  if (request.getIncludeScores()) {
    // Scores are computed by a second, separate call on the same input and sample count.
    List<LanguageIdentifier.IdResult> scored =
        selected.getScores(request.getInput(), request.getMaxSampleCount());
    for (LanguageIdentifier.IdResult entry : scored) {
      IdResult.Builder converted = IdResult.newBuilder();
      converted.setId(entry.id);
      converted.setScore(entry.score);
      response.addIdResult(converted.build());
    }
  }

  LanguageIdResponse message = response.build();
  responseObserver.onNext(message);
  responseObserver.onCompleted();
}
Aggregations