Search in sources :

Example 1 with BlockingExecutor

use of zemberek.core.concurrency.BlockingExecutor in project zemberek-nlp by ahmetaa.

In class ProcessNormalizationCorpus, the method process:

/**
 * Processes a corpus in parallel: cleans each text chunk, extracts sentences,
 * normalizes each sentence, and writes every processed chunk to a numbered
 * file under {@code outRoot}.
 *
 * @param corpusProvider supplies the corpus in chunks (iteration order defines file numbering).
 * @param threadCount    number of worker threads; BlockingExecutor blocks submit()
 *                       while all workers are busy, throttling chunk loading.
 * @param outRoot        directory that receives one UTF-8 output file per chunk.
 * @throws Exception if interrupted while waiting for the executor to finish.
 */
void process(BlockTextLoader corpusProvider, int threadCount, Path outRoot) throws Exception {
    ExecutorService service = new BlockingExecutor(threadCount);
    AtomicInteger c = new AtomicInteger(0);
    for (TextChunk chunk : corpusProvider) {
        service.submit(() -> {
            List<String> sentences = TextCleaner.cleanAndExtractSentences(chunk.getData());
            sentences = sentences.stream()
                .map(s -> normalizer.preProcess(s))
                .collect(Collectors.toList());
            // Reserve a unique file index atomically and reuse it below; reading
            // c.get() again later would race with other workers and report wrong counts.
            int index = c.getAndIncrement();
            Path p = outRoot.resolve(String.valueOf(index));
            try {
                Files.write(p, sentences, StandardCharsets.UTF_8);
            } catch (IOException e) {
                // Log and continue: one failed block should not abort the whole run.
                Log.warn("Cannot write block %d to %s : %s", index, p, e.getMessage());
            }
            Log.info((index + 1) * BLOCK_SIZE + " Lines processed.");
        });
    }
    // Stop accepting work, then wait for in-flight tasks to drain.
    service.shutdown();
    service.awaitTermination(1, TimeUnit.DAYS);
}
Also used : Path(java.nio.file.Path) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ExecutorService(java.util.concurrent.ExecutorService) TextChunk(zemberek.core.text.TextChunk) IOException(java.io.IOException) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Example 2 with BlockingExecutor

use of zemberek.core.concurrency.BlockingExecutor in project zemberek-nlp by ahmetaa.

In class StemDisambiguationExperiment, the method doit:

/**
 * Collects corpus files (a single file, or a directory walk optionally
 * restricted by {@code dirList}), then in parallel cleans each text chunk,
 * keeps only morphologically unambiguous sentences, and writes them to
 * {@code output}.
 *
 * @throws IOException if the inputs cannot be read or the output cannot be written.
 */
private void doit() throws IOException {
    System.setProperty("org.jline.terminal.dumb", "true");
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        // When a directory-name whitelist is given, only matching subdirectories are processed.
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList, "#");
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    // Report the count so the (expensive) line-counting pass above is not wasted work.
    Log.info("There are %d lines to process.", totalLines);
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                // LinkedHashSet de-duplicates lines while keeping their original order.
                List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
                List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
                sentences = sentences.stream().filter(this::unambiguous).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
                // Single lock guards both the shared writer and the counter so
                // a chunk's sentences are written contiguously.
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    System.out.println(chunk.size());
                }
            });
        }
        executor.shutdown();
        // BUG FIX: wait for all workers before try-with-resources closes the
        // PrintWriter; otherwise in-flight chunks are lost and the count below is wrong.
        try {
            executor.awaitTermination(1, java.util.concurrent.TimeUnit.DAYS);
        } catch (InterruptedException e) {
            // Preserve the interrupt flag; this method's signature only allows IOException.
            Thread.currentThread().interrupt();
        }
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) TextCleaner(zemberek.normalization.TextCleaner) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) RootLexicon(zemberek.morphology.lexicon.RootLexicon) BlockTextLoader(zemberek.core.text.BlockTextLoader) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) AtomicLong(java.util.concurrent.atomic.AtomicLong) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter)

Example 3 with BlockingExecutor

use of zemberek.core.concurrency.BlockingExecutor in project zemberek-nlp by ahmetaa.

In class PreprocessTurkishCorpus, the method run:

@Override
public void run() throws IOException, InterruptedException {
    // Collect the input files: either a single file, or a (optionally recursive)
    // walk of subdirectories, optionally restricted by a directory-name whitelist.
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList);
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
        for (Path directory : directories) {
            // Skip directories not on the whitelist (only when a whitelist was given).
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            // Only direct children of the directory; `extension == null` means accept all.
            paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile() && (extension == null || s.endsWith(extension))).collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    // Pre-count total lines so workers can report "(x of total)" progress below.
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    // Effectively-final copy so the lambda below can capture it.
    final long total = totalLines;
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    // Morphology is expensive to build; only needed when lemmatizing.
    if (operation == Operation.LEMMA) {
        morphology = TurkishMorphology.createWithDefaults();
    }
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
        // BlockingExecutor throttles submit() so chunks are not loaded faster
        // than workers can consume them.
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        AtomicInteger count = new AtomicInteger(0);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                List<String> processed = chunk.getData().stream().filter(// ignore meta tag lines.
                s -> !s.startsWith("<")).map(TextUtil::normalizeSpacesAndSoftHyphens).collect(Collectors.toList());
                List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
                // Per sentence: drop combining-diacritic text, then either lemmatize
                // or tokenize, then optionally lowercase with Turkish locale rules
                // (dotted/dotless 'i' handling).
                sentences = sentences.stream().filter(s -> !TextUtil.containsCombiningDiacritics(s)).map(s -> {
                    if (operation == Operation.LEMMA) {
                        return replaceWordsWithLemma(s);
                    } else {
                        return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
                    }
                }).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
                // One lock guards writer + counters so each chunk's output is contiguous.
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    int c = count.addAndGet(chunk.size());
                    System.out.println(String.format("(%d of %d lines) processed.", c, total));
                }
            });
        }
        // Drain all workers before try-with-resources closes the writer.
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.DAYS);
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) Parameter(com.beust.jcommander.Parameter) TextUtil(zemberek.core.text.TextUtil) ConsoleApp(zemberek.apps.ConsoleApp) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) ConcurrencyUtil(zemberek.core.concurrency.ConcurrencyUtil) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) TextIO(zemberek.core.text.TextIO) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) BlockTextLoader(zemberek.core.text.BlockTextLoader) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) AtomicLong(java.util.concurrent.atomic.AtomicLong) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter)

Example 4 with BlockingExecutor

use of zemberek.core.concurrency.BlockingExecutor in project zemberek-nlp by ahmetaa.

In class NormalizationScripts, the method cleanTwitterData:

/**
 * Cleans a Twitter corpus in parallel: analyzes each text block with a cached
 * Turkish morphology (unidentified-token analysis disabled so foreign text is
 * not force-analyzed) and routes lines via {@link TwitterSaver} to {@code out}
 * or to a sibling {@code .foreign} file.
 *
 * @param in  input corpus path.
 * @param out output path; foreign content goes to {@code out + ".foreign"}.
 * @throws Exception if a worker task fails or the wait is interrupted.
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
    AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
    TurkishMorphology morphology = TurkishMorphology.builder().setCache(cache).setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    // Use half the cores, capped at 20, leaving headroom for I/O and other work.
    int threadCount = Runtime.getRuntime().availableProcessors() / 2;
    if (threadCount > 20) {
        threadCount = 20;
    }
    ExecutorService executorService = new BlockingExecutor(threadCount);
    CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
    int blockSize = 20_000;
    BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
    Path foreign = Paths.get(out.toString() + ".foreign");
    TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
    int bc = 0;
    for (TextChunk block : loader) {
        service.submit(new TwitterTask(morphology, saver, block, bc));
        bc++;
    }
    // BUG FIX: drain the completion queue. Without take()/get(), any exception
    // thrown inside a TwitterTask is silently swallowed by the CompletionService.
    for (int i = 0; i < bc; i++) {
        service.take().get();
    }
    executorService.shutdown();
    executorService.awaitTermination(1, TimeUnit.DAYS);
}
Also used : AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) ExecutorService(java.util.concurrent.ExecutorService) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Example 5 with BlockingExecutor

use of zemberek.core.concurrency.BlockingExecutor in project zemberek-nlp by ahmetaa.

In class NormalizationVocabularyGenerator, the method collectVocabularyHistogram:

/**
 * Builds a word-frequency vocabulary from a corpus by running one
 * {@code WordCollectorTask} per text chunk on a bounded thread pool.
 * The {@link BlockingExecutor} throttles submission, so chunks are never
 * loaded faster than the workers can process them.
 *
 * @param corpora     corpus supplied chunk by chunk.
 * @param threadCount number of worker threads.
 * @return the accumulated vocabulary histogram.
 * @throws Exception if interrupted while waiting for the workers to finish.
 */
Vocabulary collectVocabularyHistogram(BlockTextLoader corpora, int threadCount) throws Exception {
    Vocabulary histogram = new Vocabulary();
    ExecutorService pool = new BlockingExecutor(threadCount);
    for (TextChunk block : corpora) {
        Log.info("Processing %s", block);
        pool.submit(new WordCollectorTask(block, histogram));
    }
    // No new work is accepted past this point; wait for in-flight tasks to drain.
    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.DAYS);
    return histogram;
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) TextChunk(zemberek.core.text.TextChunk) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Aggregations

BlockingExecutor (zemberek.core.concurrency.BlockingExecutor)5 TextChunk (zemberek.core.text.TextChunk)5 Path (java.nio.file.Path)4 IOException (java.io.IOException)3 ExecutorService (java.util.concurrent.ExecutorService)3 BlockTextLoader (zemberek.core.text.BlockTextLoader)3 TurkishMorphology (zemberek.morphology.TurkishMorphology)3 PrintWriter (java.io.PrintWriter)2 Files (java.nio.file.Files)2 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Set (java.util.Set)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 AtomicLong (java.util.concurrent.atomic.AtomicLong)2 Collectors (java.util.stream.Collectors)2 Log (zemberek.core.logging.Log)2 TextIO (zemberek.core.text.TextIO)2 Turkish (zemberek.core.turkish.Turkish)2 AnalysisCache (zemberek.morphology.analysis.AnalysisCache)2