Use of zemberek.morphology.analysis.AnalysisCache in project zemberek-nlp by ahmetaa.
Class NormalizationScripts, method cleanTwitterData.
/**
 * Processes the text at {@code in} block by block on a worker pool, submitting one
 * {@code TwitterTask} per loaded block.
 *
 * <p>A single {@code TwitterSaver} is shared by all tasks. It is constructed with {@code out}
 * and a sibling path formed by appending {@code ".foreign"} to {@code out}. What each task
 * actually writes is decided by {@code TwitterTask}/{@code TwitterSaver}, which are not
 * visible here.
 *
 * @param in path of the input text, read in blocks of 20,000 (unit defined by
 *     {@code BlockTextLoader})
 * @param out primary output path handed to the saver
 * @throws Exception if loading, task submission, or waiting for termination fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
  AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setCache(cache)
      .setLexicon(RootLexicon.getDefault())
      .disableUnidentifiedTokenAnalyzer()
      .build();

  // Use half of the available processors, capped at 20. The lower bound of 1 matters on
  // single-CPU hosts, where availableProcessors() / 2 == 0 and the pool would be asked
  // for zero worker threads.
  int threadCount = Math.max(1, Math.min(20, Runtime.getRuntime().availableProcessors() / 2));

  ExecutorService executorService = new BlockingExecutor(threadCount);
  try {
    CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
    int blockSize = 20_000;
    BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
    Path foreign = Paths.get(out.toString() + ".foreign");
    TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
    // bc is the running block index passed to each task.
    int bc = 0;
    for (TextChunk block : loader) {
      service.submit(new TwitterTask(morphology, saver, block, bc));
      bc++;
    }
  } finally {
    // Always stop accepting work, even when loading or submission throws; otherwise the
    // pool's worker threads would be left running.
    executorService.shutdown();
  }
  // Wait for the already-submitted tasks to finish. The boolean result (timeout or not)
  // is intentionally not inspected, matching the previous behavior.
  executorService.awaitTermination(1, TimeUnit.DAYS);
}
Use of zemberek.morphology.analysis.AnalysisCache in project zemberek-nlp by ahmetaa.
Class StemDisambiguationExperiment, method main.
/**
 * Entry point: configures a {@code StemDisambiguationExperiment} with fixed input/output
 * locations, builds the shared {@code morphology} instance with an analysis cache, and runs
 * the experiment.
 *
 * @param args command-line arguments (not used)
 * @throws IOException declared for failures raised while setting up or running the experiment
 */
public static void main(String[] args) throws IOException {
  StemDisambiguationExperiment exp = new StemDisambiguationExperiment();

  // Fixed locations: the corpora root doubles as the experiment input directory.
  Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
  exp.output = Paths.get("/media/ahmetaa/depo/out/foo");
  exp.dirList = corporaRoot.resolve("dis-list");
  exp.input = corporaRoot;

  // Analysis cache sized with a static part and a dynamic part.
  AnalysisCache analysisCache = AnalysisCache.builder()
      .staticCacheSize(50_000)
      .dynamicCacheSize(50_000, 200_000)
      .build();

  // morphology is declared outside this method; it is assigned before the run starts.
  morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .setCache(analysisCache)
      .build();

  exp.doit();
}
Use of zemberek.morphology.analysis.AnalysisCache in project zemberek-nlp by ahmetaa.
Class NormalizationVocabularyGenerator, method getTurkishMorphology.
/**
 * Builds a {@code TurkishMorphology} backed by a lexicon loaded from the bundled dictionary
 * resources, with the unidentified-token analyzer disabled and a dynamic analysis cache.
 *
 * @param asciiTolerant when {@code true}, the builder is additionally configured via
 *     {@code ignoreDiacriticsInAnalysis()} before the instance is built
 * @return the configured morphology instance
 * @throws IOException declared for failures while loading the dictionary resources
 */
static TurkishMorphology getTurkishMorphology(boolean asciiTolerant) throws IOException {
  AnalysisCache analysisCache = AnalysisCache.builder()
      .dynamicCacheSize(200_000, 400_000)
      .build();

  RootLexicon rootLexicon = TurkishDictionaryLoader.loadFromResources(
      "tr/master-dictionary.dict",
      "tr/non-tdk.dict",
      "tr/proper.dict",
      "tr/proper-from-corpus.dict",
      "tr/abbreviations.dict",
      "tr/person-names.dict");

  TurkishMorphology.Builder morphologyBuilder = TurkishMorphology.builder()
      .setLexicon(rootLexicon)
      .disableUnidentifiedTokenAnalyzer()
      .setCache(analysisCache);

  // Default case: no diacritics handling requested, build as configured.
  if (!asciiTolerant) {
    return morphologyBuilder.build();
  }

  morphologyBuilder.ignoreDiacriticsInAnalysis();
  return morphologyBuilder.build();
}
Aggregations