Search in sources :

Example 11 with TextChunk

use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class NormalizationScripts, method cleanTwitterData.

/**
 * Reads the text at {@code in} block by block and submits one {@code TwitterTask} per block to a
 * bounded worker pool. All tasks share a single {@code TwitterSaver} that is constructed with
 * {@code out} and a sibling path named {@code out + ".foreign"}.
 *
 * <p>The method blocks until every submitted task has finished (or one day has elapsed).
 *
 * @param in path of the input corpus
 * @param out path handed to the saver as the main output; the secondary output path is derived
 *     from it by appending {@code ".foreign"}
 * @throws Exception if loading the input, building the morphology, or waiting for the workers
 *     fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
    AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
    TurkishMorphology morphology = TurkishMorphology.builder().setCache(cache).setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    // Use half of the available processors, capped at 20. The lower bound of 1 matters on
    // single-core hosts, where availableProcessors() / 2 is 0 and the pool would be created
    // with no worker threads at all.
    int threadCount = Math.max(1, Math.min(20, Runtime.getRuntime().availableProcessors() / 2));
    ExecutorService executorService = new BlockingExecutor(threadCount);
    CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
    int blockSize = 20_000;
    BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
    Path foreign = Paths.get(out.toString() + ".foreign");
    TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
    // bc is the running block index passed to each task.
    int bc = 0;
    for (TextChunk block : loader) {
        service.submit(new TwitterTask(morphology, saver, block, bc));
        bc++;
    }
    // No more work will be submitted; wait for the in-flight tasks to drain.
    executorService.shutdown();
    executorService.awaitTermination(1, TimeUnit.DAYS);
}
Also used : AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) ExecutorService(java.util.concurrent.ExecutorService) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Example 12 with TextChunk

use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class QuestionClassifier, method collectCoarseData.

/**
 * Collects raw training lines for a coarse question / not-question classifier from a subtitle
 * corpus directory and writes three files: the question lines, the not-question lines, and a
 * shuffled concatenation of both.
 *
 * <p>Lines longer than 80 characters are ignored. A line counts as a question when it ends with
 * {@code "?"}; for roughly one in three of those the question marks are stripped, so the data
 * also contains questions without the trailing mark. Each line is prefixed with its
 * {@code __label__...} tag. Collection stops once both quotas (200,000 questions and 300,000
 * not-questions) are met or the corpus is exhausted.
 *
 * @throws IOException if reading the corpus or writing any of the output files fails
 */
private static void collectCoarseData() throws IOException {
    Path root = Paths.get("/media/ahmetaa/depo/corpora/open-subtitles-tr-2018-small");
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectory(root);
    // Insertion-ordered sets: duplicates are dropped while corpus order is kept.
    LinkedHashSet<String> questions = new LinkedHashSet<>();
    LinkedHashSet<String> notQuestions = new LinkedHashSet<>();
    Random rnd = new Random();
    int quesCount = 200_000;
    int noQuesCount = 300_000;
    for (TextChunk chunk : corpusProvider) {
        for (String line : chunk) {
            if (line.length() > 80) {
                continue;
            }
            if (line.endsWith("?")) {
                // A question line must never fall through to the not-question branch, even
                // when the question quota is already full; it is simply skipped in that case.
                if (questions.size() < quesCount) {
                    int r = rnd.nextInt(3);
                    if (r < 2) {
                        questions.add("__label__question " + line);
                    } else {
                        // About one third of the questions are stored without any '?'.
                        questions.add("__label__question " + line.replaceAll("\\?", "").trim());
                    }
                }
            } else if (notQuestions.size() < noQuesCount) {
                notQuestions.add("__label__not_question " + line);
            }
        }
        if (questions.size() == quesCount && notQuestions.size() == noQuesCount) {
            break;
        }
    }
    Path outQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/questions-raw");
    Files.write(outQ, questions, StandardCharsets.UTF_8);
    Path outNotQ = Paths.get("/media/ahmetaa/depo/classification/question/coarse/not_questions-raw");
    // Write the not-question set here (the question set was already written to outQ).
    Files.write(outNotQ, notQuestions, StandardCharsets.UTF_8);
    List<String> all = new ArrayList<>(questions);
    all.addAll(notQuestions);
    Collections.shuffle(all);
    Path allData = Paths.get("/media/ahmetaa/depo/classification/question/coarse/all-raw");
    Files.write(allData, all, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) Random(java.util.Random) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk)

Example 13 with TextChunk

use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class NormalizationVocabularyGenerator, method collectVocabularyHistogram.

/**
 * Fills a new {@code Vocabulary} by submitting one {@code WordCollectorTask} per text chunk of
 * {@code corpora} to a {@code BlockingExecutor}, then waits for the workers to finish.
 *
 * @param corpora source of the text chunks to process
 * @param threadCount value passed to the {@code BlockingExecutor} constructor
 * @return the {@code Vocabulary} instance that was shared with every task
 * @throws Exception if iterating the corpora or waiting for termination fails
 */
Vocabulary collectVocabularyHistogram(BlockTextLoader corpora, int threadCount) throws Exception {
    ExecutorService workers = new BlockingExecutor(threadCount);
    Vocabulary vocabulary = new Vocabulary();

    for (TextChunk textChunk : corpora) {
        Log.info("Processing %s", textChunk);
        // Every task receives the same Vocabulary instance as its sink.
        WordCollectorTask collector = new WordCollectorTask(textChunk, vocabulary);
        workers.submit(collector);
    }

    // Stop accepting work, then block (up to one day) until submitted tasks are done.
    workers.shutdown();
    workers.awaitTermination(1, TimeUnit.DAYS);

    return vocabulary;
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) TextChunk(zemberek.core.text.TextChunk) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Aggregations

TextChunk (zemberek.core.text.TextChunk): 13, BlockTextLoader (zemberek.core.text.BlockTextLoader): 11, Path (java.nio.file.Path): 9, ArrayList (java.util.ArrayList): 6, LinkedHashSet (java.util.LinkedHashSet): 5, BlockingExecutor (zemberek.core.concurrency.BlockingExecutor): 5, IOException (java.io.IOException): 4, PrintWriter (java.io.PrintWriter): 4, TurkishMorphology (zemberek.morphology.TurkishMorphology): 4, List (java.util.List): 3, ExecutorService (java.util.concurrent.ExecutorService): 3, Collectors (java.util.stream.Collectors): 3, Log (zemberek.core.logging.Log): 3, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 3, Files (java.nio.file.Files): 2, HashSet (java.util.HashSet): 2, Random (java.util.Random): 2, Set (java.util.Set): 2, AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2, AtomicLong (java.util.concurrent.atomic.AtomicLong): 2