Search in sources :

Example 1 with TextUtil

Use of zemberek.core.text.TextUtil in the project zemberek-nlp by ahmetaa.

From the class PreprocessTurkishCorpus, method run.

/**
 * Preprocesses the input corpus and writes one processed sentence per line to {@code output}.
 *
 * <p>Steps:
 * <ol>
 *   <li>Collect the files to process (see {@link #collectInputPaths()}).</li>
 *   <li>Count their lines so progress can be reported against a total.</li>
 *   <li>Load the files in blocks of 10,000 lines and process each block on the executor
 *       (see {@link #processChunk(TextChunk)}).</li>
 *   <li>Append the resulting sentences to the UTF-8 encoded output file.</li>
 * </ol>
 *
 * <p>If no input file is found the JVM exits with status 0.
 *
 * @throws IOException if listing, reading or writing files fails
 * @throws InterruptedException if interrupted while waiting for the workers to finish
 */
@Override
public void run() throws IOException, InterruptedException {
    List<Path> paths = collectInputPaths();
    Log.info("There are %d files to process.", paths.size());
    // Bail out before the (potentially expensive) line counting pass when there is nothing to do.
    if (paths.isEmpty()) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    final long total = totalLines;
    AtomicLong sentenceCount = new AtomicLong(0);
    if (operation == Operation.LEMMA) {
        morphology = TurkishMorphology.createWithDefaults();
    }
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        AtomicInteger count = new AtomicInteger(0);
        try {
            for (TextChunk chunk : loader) {
                executor.submit(() -> {
                    List<String> sentences = processChunk(chunk);
                    // Writing and progress reporting share one lock so that the sentences of a
                    // chunk are written contiguously and the progress lines do not interleave.
                    synchronized (this) {
                        sentences.forEach(pw::println);
                        sentenceCount.addAndGet(sentences.size());
                        int c = count.addAndGet(chunk.size());
                        System.out.println(String.format("(%d of %d lines) processed.", c, total));
                    }
                });
            }
        } finally {
            // Always stop the workers and wait for them, even when iterating or submitting
            // failed, so that the writer is not closed while tasks are still using it.
            executor.shutdown();
            executor.awaitTermination(1, TimeUnit.DAYS);
        }
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}

/**
 * Collects the files that will be processed.
 *
 * <p>If {@code input} is a regular file, it is the only result. Otherwise the sub directories
 * of {@code input} are listed (all levels when {@code recurse} is set, direct children
 * otherwise), optionally restricted to the directory names loaded from {@code dirList}, and
 * the regular files directly inside each of those directories are returned. Files that sit
 * directly in {@code input} itself are not included, because {@code input} is excluded from
 * the directory list.
 *
 * @return the files to process, possibly empty
 * @throws IOException if a directory cannot be traversed or {@code dirList} cannot be read
 */
private List<Path> collectInputPaths() throws IOException {
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
        return paths;
    }
    Set<String> dirNamesToProcess = new HashSet<>();
    if (dirList != null) {
        List<String> dirNames = TextIO.loadLines(dirList);
        Log.info("Directory names to process:");
        for (String dirName : dirNames) {
            Log.info(dirName);
        }
        dirNamesToProcess.addAll(dirNames);
    }
    List<Path> directories;
    // Files.walk holds open directory handles; the stream must be closed explicitly.
    try (java.util.stream.Stream<Path> walk = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)) {
        directories = walk
            .filter(s -> s.toFile().isDirectory() && !s.equals(input))
            .collect(Collectors.toList());
    }
    for (Path directory : directories) {
        if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
            continue;
        }
        try (java.util.stream.Stream<Path> files = Files.walk(directory, 1)) {
            // Compare against the file name text: Path.endsWith(String) matches whole path
            // elements, so it would never match a suffix such as ".txt".
            // NOTE(review): assumes `extension` is declared as a String - confirm.
            paths.addAll(files
                .filter(s -> s.toFile().isFile()
                    && (extension == null || s.getFileName().toString().endsWith(extension)))
                .collect(Collectors.toList()));
        }
    }
    return paths;
}

/**
 * Converts the raw lines of a chunk into the sentences to be written.
 *
 * <p>Lines starting with {@code "<"} (meta tag lines) are ignored, the remaining lines are
 * passed through {@code TextUtil.normalizeSpacesAndSoftHyphens} and split into sentences.
 * Sentences for which {@code TextUtil.containsCombiningDiacritics} returns true are dropped.
 * Each remaining sentence is either replaced by the result of {@code replaceWordsWithLemma}
 * (for {@code Operation.LEMMA}) or re-joined from its tokens with single spaces, and is
 * lower-cased with the Turkish locale when {@code toLowercase} is set.
 *
 * @param chunk block of input lines
 * @return processed sentences in their original order
 */
private List<String> processChunk(TextChunk chunk) {
    List<String> paragraphs = chunk.getData().stream()
        .filter(s -> !s.startsWith("<"))
        .map(TextUtil::normalizeSpacesAndSoftHyphens)
        .collect(Collectors.toList());
    return TurkishSentenceExtractor.DEFAULT.fromParagraphs(paragraphs).stream()
        .filter(s -> !TextUtil.containsCombiningDiacritics(s))
        .map(s -> operation == Operation.LEMMA
            ? replaceWordsWithLemma(s)
            : String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s)))
        .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
        .collect(Collectors.toList());
}
Also used : Path(java.nio.file.Path) Parameter(com.beust.jcommander.Parameter) TextUtil(zemberek.core.text.TextUtil) ConsoleApp(zemberek.apps.ConsoleApp) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) ConcurrencyUtil(zemberek.core.concurrency.ConcurrencyUtil) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) TextIO(zemberek.core.text.TextIO) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) BlockTextLoader(zemberek.core.text.BlockTextLoader) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) AtomicLong(java.util.concurrent.atomic.AtomicLong) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter)

Aggregations

Parameter (com.beust.jcommander.Parameter)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Set (java.util.Set)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 Collectors (java.util.stream.Collectors)1 ConsoleApp (zemberek.apps.ConsoleApp)1 BlockingExecutor (zemberek.core.concurrency.BlockingExecutor)1 ConcurrencyUtil (zemberek.core.concurrency.ConcurrencyUtil)1 Log (zemberek.core.logging.Log)1 BlockTextLoader (zemberek.core.text.BlockTextLoader)1 TextChunk (zemberek.core.text.TextChunk)1 TextIO (zemberek.core.text.TextIO)1