Usage of zemberek.core.text.TextChunk in the project zemberek-nlp by ahmetaa.
Example: class NormalizationScripts, method cleanTwitterData.
/**
 * Cleans a raw twitter corpus file using morphological analysis, splitting
 * the output into Turkish ({@code out}) and foreign ({@code out}.foreign) files.
 * Work is distributed over a bounded thread pool; TwitterSaver collects results.
 *
 * @param in  input corpus path, read in blocks
 * @param out output path for the cleaned (Turkish) content
 * @throws Exception if morphology setup or executor shutdown fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
  // Morphology with a dynamic analysis cache; unidentified-token analysis is
  // disabled, presumably because noisy twitter tokens would pollute results.
  AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
  TurkishMorphology morphology =
      TurkishMorphology.builder()
          .setCache(cache)
          .setLexicon(RootLexicon.getDefault())
          .disableUnidentifiedTokenAnalyzer()
          .build();

  // Use half the processors, capped at 20 worker threads.
  int workerCount = Math.min(Runtime.getRuntime().availableProcessors() / 2, 20);
  ExecutorService executor = new BlockingExecutor(workerCount);
  CompletionService<TwitterSaver> completionService = new ExecutorCompletionService<>(executor);

  int blockSize = 20_000;
  BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
  // Non-Turkish lines are written to a sibling ".foreign" file.
  Path foreignPath = Paths.get(out.toString() + ".foreign");
  TwitterSaver saver = new TwitterSaver(out, foreignPath, blockSize);

  int blockIndex = 0;
  for (TextChunk chunk : loader) {
    // BlockingExecutor applies backpressure, so this submit loop cannot
    // flood the queue; task results are not consumed here.
    completionService.submit(new TwitterTask(morphology, saver, chunk, blockIndex));
    blockIndex++;
  }
  executor.shutdown();
  executor.awaitTermination(1, TimeUnit.DAYS);
}
Usage of zemberek.core.text.TextChunk in the project zemberek-nlp by ahmetaa.
Example: class QuestionClassifier, method collectCoarseData.
/**
 * Collects coarse question/not-question training data from a subtitle corpus.
 * Sentences ending with "?" become {@code __label__question} examples (about a
 * third of them with the question marks stripped, so the classifier cannot rely
 * solely on punctuation); all others become {@code __label__not_question}.
 * Writes three files: raw questions, raw not-questions, and a shuffled union.
 *
 * @throws IOException if any output file cannot be written
 */
private static void collectCoarseData() throws IOException {
  Path root = Paths.get("/media/ahmetaa/depo/corpora/open-subtitles-tr-2018-small");
  BlockTextLoader corpusProvider = BlockTextLoader.fromDirectory(root);
  // LinkedHashSet: deduplicates while keeping corpus order.
  LinkedHashSet<String> questions = new LinkedHashSet<>();
  LinkedHashSet<String> notQuestions = new LinkedHashSet<>();
  Random rnd = new Random();
  int quesCount = 200_000;
  int noQuesCount = 300_000;
  for (TextChunk chunk : corpusProvider) {
    for (String line : chunk) {
      // Skip long sentences.
      if (line.length() > 80) {
        continue;
      }
      if (line.endsWith("?") && questions.size() < quesCount) {
        // Keep the question mark with probability 2/3; strip it otherwise.
        if (rnd.nextInt(3) < 2) {
          questions.add("__label__question " + line);
        } else {
          questions.add("__label__question " + line.replaceAll("\\?", "").trim());
        }
      } else if (notQuestions.size() < noQuesCount) {
        notQuestions.add("__label__not_question " + line);
      }
    }
    // Stop scanning once both quotas are filled.
    if (questions.size() == quesCount && notQuestions.size() == noQuesCount) {
      break;
    }
  }
  Path outQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/questions-raw");
  Files.write(outQ, questions, StandardCharsets.UTF_8);
  Path outNotQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/not_questions-raw");
  // Bug fix: the original wrote `questions` here, so the not-question file
  // contained question data instead of `notQuestions`.
  Files.write(outNotQ, notQuestions, StandardCharsets.UTF_8);
  List<String> all = new ArrayList<>(questions);
  all.addAll(notQuestions);
  Collections.shuffle(all);
  Path allData = Paths.get("/media/ahmetaa/depo/classification/question/coarse/all-raw");
  Files.write(allData, all, StandardCharsets.UTF_8);
}
Usage of zemberek.core.text.TextChunk in the project zemberek-nlp by ahmetaa.
Example: class NormalizationVocabularyGenerator, method collectVocabularyHistogram.
/**
 * Builds a word-count vocabulary from all chunks of the given corpora,
 * processing chunks concurrently on a bounded thread pool.
 *
 * NOTE(review): all WordCollectorTask instances share the same Vocabulary
 * instance — assumes Vocabulary (or the task) handles concurrent updates;
 * confirm against their implementations.
 *
 * @param corpora     block-wise text loader over the corpus files
 * @param threadCount number of worker threads
 * @return the populated vocabulary
 * @throws Exception if executor termination is interrupted
 */
Vocabulary collectVocabularyHistogram(BlockTextLoader corpora, int threadCount) throws Exception {
  // BlockingExecutor applies backpressure so submission cannot outrun workers.
  ExecutorService pool = new BlockingExecutor(threadCount);
  Vocabulary vocabulary = new Vocabulary();
  for (TextChunk block : corpora) {
    Log.info("Processing %s", block);
    pool.submit(new WordCollectorTask(block, vocabulary));
  }
  pool.shutdown();
  pool.awaitTermination(1, TimeUnit.DAYS);
  return vocabulary;
}
Aggregations