
Example 1 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

Class ProcessNormalizationCorpus, method process:

void process(BlockTextLoader corpusProvider, int threadCount, Path outRoot) throws Exception {
    // Run the per-chunk work on threadCount worker threads.
    ExecutorService service = new BlockingExecutor(threadCount);
    AtomicInteger c = new AtomicInteger(0);
    for (TextChunk chunk : corpusProvider) {
        service.submit(() -> {
            // Clean the raw lines of this chunk and split them into sentences.
            List<String> sentences = TextCleaner.cleanAndExtractSentences(chunk.getData());
            sentences = sentences.stream()
                .map(s -> normalizer.preProcess(s))
                .collect(Collectors.toList());
            // Each chunk is written to its own numbered file under outRoot.
            Path p = outRoot.resolve(String.valueOf(c.getAndIncrement()));
            try {
                Files.write(p, sentences, StandardCharsets.UTF_8);
            } catch (IOException e) {
                e.printStackTrace();
            }
            Log.info(c.get() * BLOCK_SIZE + " Lines processed.");
        });
    }
    service.shutdown();
    service.awaitTermination(1, TimeUnit.DAYS);
}
Also used : Path(java.nio.file.Path) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ExecutorService(java.util.concurrent.ExecutorService) TextChunk(zemberek.core.text.TextChunk) IOException(java.io.IOException) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)
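The examples on this page share the same iteration pattern: a BlockTextLoader splits a corpus file into fixed-size blocks, and each block is delivered as a TextChunk whose lines are available through getData(). A minimal sketch of that pattern, assuming a plain text file at a hypothetical path:

import java.nio.file.Path;
import java.nio.file.Paths;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class TextChunkIterationSketch {
    public static void main(String[] args) {
        // Hypothetical corpus location; any plain text file works.
        Path corpus = Paths.get("corpus.txt");
        // Read the file in blocks of 10,000 lines; each block arrives as a TextChunk.
        BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
        for (TextChunk chunk : loader) {
            // getData() exposes the lines of the current block.
            for (String line : chunk.getData()) {
                System.out.println(line);
            }
        }
    }
}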

Example 2 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

Class RemoveDuplicateLines, method recreateCorpus:

private void recreateCorpus() throws IOException {
    int lineCounter = 0;
    int writtenLines = 0;
    try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(IOUtil.geBufferedOutputStream(output), "UTF-8"))) {
        BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
        for (TextChunk block : loader) {
            for (String line : block.getData()) {
                String l = line;
                if (normalizeLines) {
                    l = process(line);
                }
                lineCounter++;
                if (lineCounter % PROGRESS == 0) {
                    Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
                }
                long hash = longHash(l);
                // Write the line only if this is the occurrence recorded by findDuplicates().
                if (index.get(hash) == lineCounter) {
                    if (writeCounts) {
                        // Optionally prefix each line with its duplicate count.
                        writer.println(histogram.get(hash) + " " + line);
                    } else {
                        writer.println(line);
                    }
                    writtenLines++;
                }
            }
            if (count != -1 && lineCounter > count) {
                break;
            }
        }
        Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
    }
}
Also used : BlockTextLoader(zemberek.core.text.BlockTextLoader) OutputStreamWriter(java.io.OutputStreamWriter) TextChunk(zemberek.core.text.TextChunk) PrintWriter(java.io.PrintWriter)

Example 3 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

Class RemoveDuplicateLines, method findDuplicates:

private void findDuplicates() {
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    int lineCounter = 0;
    for (TextChunk block : loader) {
        for (String line : block.getData()) {
            String l = line;
            if (normalizeLines) {
                l = process(line);
            }
            totalCount++;
            lineCounter++;
            if (totalCount % PROGRESS == 0) {
                Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
            }
            long hash = longHash(l);
            // Remember the first occurrence of each line hash; repeats count as duplicates.
            if (index.containsKey(hash)) {
                duplicateCount++;
            } else {
                index.put(hash, totalCount);
            }
            histogram.increment(hash);
        }
        if (count != -1 && lineCounter > count) {
            break;
        }
    }
    Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
    Log.info("Duplicate Ratio: %.3f", duplicateCount * 100.0d / totalCount);
}
Also used : BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk)
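Examples 2 and 3 are the two passes of one deduplication tool: findDuplicates records, for each line hash, the line number of its first occurrence, and recreateCorpus then writes a line only when it is that recorded occurrence. A condensed sketch of the idea, using a plain HashMap and String.hashCode as stand-ins for the tool's index, histogram, and longHash members (the stand-ins are assumptions, not the original implementation):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DedupSketch {

    // Returns the input lines with every duplicate (by hash) removed.
    static List<String> dedup(List<String> lines) {
        // Pass 1: remember the line number of the first occurrence of each hash.
        Map<Long, Integer> firstSeen = new HashMap<>();
        int counter = 0;
        for (String line : lines) {
            counter++;
            long hash = line.hashCode(); // stand-in for the tool's longHash()
            firstSeen.putIfAbsent(hash, counter);
        }
        // Pass 2: keep a line only if this is the occurrence recorded in pass 1.
        List<String> result = new ArrayList<>();
        counter = 0;
        for (String line : lines) {
            counter++;
            long hash = line.hashCode();
            if (firstSeen.get(hash) == counter) {
                result.add(line);
            }
        }
        return result;
    }
}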

Example 4 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

Class ItemFindExperiment, method processCorpus:

static void processCorpus(Path in, Path out) throws IOException {
    BlockTextLoader loader = BlockTextLoader.fromPath(in, 10000);
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (TextChunk chunk : loader) {
            // Drop duplicate lines within the chunk while preserving order.
            LinkedHashSet<String> unique = new LinkedHashSet<>(chunk.getData());
            for (String l : unique) {
                // Skip lines containing markup-like characters.
                if (!Strings.containsNone(l, "[]#~|")) {
                    continue;
                }
                // Lowercase with the Spanish locale, keep only digits and Spanish letters,
                // then collapse whitespace.
                l = l.toLowerCase(es)
                    .replaceAll("[^0-9a-zñáéíóúü]", " ")
                    .replaceAll("\\s+", " ")
                    .trim();
                if (l.length() == 0) {
                    continue;
                }
                // Ignore very short lines.
                if (l.length() < 20) {
                    continue;
                }
                pw.println(l);
            }
        }
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk) PrintWriter(java.io.PrintWriter)
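The filter in processCorpus keeps only lines free of wiki-style markup characters, lowercases them with a Spanish locale, strips everything except digits and Spanish letters, collapses whitespace, and drops short results. A standalone sketch of that cleanup chain; the es locale is a field of the original class, so it is constructed explicitly here, and the sample line is made up:

import java.util.Locale;

public class CleanupSketch {
    public static void main(String[] args) {
        Locale es = new Locale("es"); // the original class keeps this as a field
        String line = "  El Ñandú corre   RÁPIDO (100 km/h)!  ";
        String cleaned = line.toLowerCase(es)
            .replaceAll("[^0-9a-zñáéíóúü]", " ") // keep digits and Spanish letters only
            .replaceAll("\\s+", " ")             // collapse runs of whitespace
            .trim();
        System.out.println(cleaned); // prints: el ñandú corre rápido 100 km h
    }
}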

Example 5 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

Class StemDisambiguationExperiment, method doit:

private void doit() throws IOException {
    System.setProperty("org.jline.terminal.dumb", "true");
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList, "#");
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
            .filter(s -> s.toFile().isDirectory() && !s.equals(input))
            .collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            paths.addAll(Files.walk(directory, 1)
                .filter(s -> s.toFile().isFile())
                .collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
                List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
                sentences = sentences.stream()
                    .filter(this::unambiguous)
                    .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
                    .collect(Collectors.toList());
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    System.out.println(chunk.size());
                }
            });
        }
        executor.shutdown();
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Set(java.util.Set) List(java.util.List) Files(java.nio.file.Files) Paths(java.nio.file.Paths) PrintWriter(java.io.PrintWriter) IOException(java.io.IOException) AtomicLong(java.util.concurrent.atomic.AtomicLong) Collectors(java.util.stream.Collectors) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextIO(zemberek.core.text.TextIO) TextCleaner(zemberek.normalization.TextCleaner) Turkish(zemberek.core.turkish.Turkish) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) TurkishMorphology(zemberek.morphology.TurkishMorphology) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) RootLexicon(zemberek.morphology.lexicon.RootLexicon)
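process() in Example 1 and doit() in Example 5 follow the same producer/worker pattern: one task per TextChunk is submitted to a BlockingExecutor, and writes to the shared output are serialized. A stripped-down sketch of that pattern, treating BlockingExecutor as the ExecutorService it is assigned to in Example 1 (file names here are hypothetical); unlike the excerpt above, it waits for all tasks to finish before the writer closes:

import java.io.PrintWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.TimeUnit;
import zemberek.core.concurrency.BlockingExecutor;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class ParallelCorpusSketch {
    public static void main(String[] args) throws Exception {
        Path corpus = Paths.get("corpus.txt"); // hypothetical input file
        BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 30_000);
        BlockingExecutor executor = new BlockingExecutor(4);
        Object lock = new Object();
        try (PrintWriter writer = new PrintWriter("out.txt", "UTF-8")) {
            for (TextChunk chunk : loader) {
                executor.submit(() -> {
                    // Per-chunk work would happen here, off the main thread...
                    synchronized (lock) {
                        // ...while writes to the shared writer stay serialized.
                        chunk.getData().forEach(writer::println);
                    }
                });
            }
            // Wait for every submitted chunk before the writer is closed.
            executor.shutdown();
            executor.awaitTermination(1, TimeUnit.DAYS);
        }
    }
}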

Aggregations

TextChunk (zemberek.core.text.TextChunk): 13 usages
BlockTextLoader (zemberek.core.text.BlockTextLoader): 11 usages
Path (java.nio.file.Path): 9 usages
ArrayList (java.util.ArrayList): 6 usages
LinkedHashSet (java.util.LinkedHashSet): 5 usages
BlockingExecutor (zemberek.core.concurrency.BlockingExecutor): 5 usages
IOException (java.io.IOException): 4 usages
PrintWriter (java.io.PrintWriter): 4 usages
TurkishMorphology (zemberek.morphology.TurkishMorphology): 4 usages
List (java.util.List): 3 usages
ExecutorService (java.util.concurrent.ExecutorService): 3 usages
Collectors (java.util.stream.Collectors): 3 usages
Log (zemberek.core.logging.Log): 3 usages
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 3 usages
Files (java.nio.file.Files): 2 usages
HashSet (java.util.HashSet): 2 usages
Random (java.util.Random): 2 usages
Set (java.util.Set): 2 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2 usages
AtomicLong (java.util.concurrent.atomic.AtomicLong): 2 usages