
Example 6 with BlockTextLoader

Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class ItemFindExperiment, method processCorpus:

static void processCorpus(Path in, Path out) throws IOException {
    BlockTextLoader loader = BlockTextLoader.fromPath(in, 10000);
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (TextChunk chunk : loader) {
            LinkedHashSet<String> unique = new LinkedHashSet<>(chunk.getData());
            for (String l : unique) {
                // Skip lines containing any of the characters [ ] # ~ |
                if (!Strings.containsNone(l, "[]#~|")) {
                    continue;
                }
                // 'es' is a Locale field of the enclosing class; the regex keeps only digits
                // and lowercase Latin letters, including Spanish accented characters.
                l = l.toLowerCase(es).replaceAll("[^0-9a-zñáéíóúü]", " ").replaceAll("\\s+", " ").trim();
                // Keep only lines that are at least 20 characters long after cleaning.
                if (l.length() == 0) {
                    continue;
                }
                if (l.length() < 20) {
                    continue;
                }
                pw.println(l);
            }
        }
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk) PrintWriter(java.io.PrintWriter)
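
The snippet above reduces to a simple pattern: read the corpus in fixed-size line blocks, deduplicate each block, filter, and write the survivors. Below is a minimal, self-contained sketch of that pattern, assuming the same BlockTextLoader/TextChunk API used throughout these examples; the file names and the 20-character length filter are placeholders rather than part of the original class.

// Minimal sketch of the chunked read-deduplicate-filter-write pattern.
// Paths are hypothetical; the block size of 10_000 mirrors the example above.
import java.io.PrintWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashSet;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class ChunkFilterSketch {
    public static void main(String[] args) throws Exception {
        Path in = Paths.get("corpus.txt");    // placeholder input
        Path out = Paths.get("filtered.txt"); // placeholder output
        BlockTextLoader loader = BlockTextLoader.fromPath(in, 10_000);
        try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
            for (TextChunk chunk : loader) {
                // Deduplicate within the block while preserving line order.
                for (String line : new LinkedHashSet<>(chunk.getData())) {
                    String cleaned = line.trim();
                    if (cleaned.length() >= 20) {
                        pw.println(cleaned);
                    }
                }
            }
        }
    }
}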

Example 7 with BlockTextLoader

Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class StemDisambiguationExperiment, method doit:

private void doit() throws IOException {
    // Force a dumb terminal so jline does not emit warnings in non-interactive runs.
    System.setProperty("org.jline.terminal.dumb", "true");
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList, "#");
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile()).collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
                List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
                sentences = sentences.stream().filter(this::unambiguous).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    System.out.println(chunk.size());
                }
            });
        }
        executor.shutdown();
        // Wait for all submitted tasks to finish before the try-with-resources block
        // closes the shared writer (same pattern as Example 9).
        executor.awaitTermination(1, TimeUnit.DAYS);
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) TextCleaner(zemberek.normalization.TextCleaner) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) RootLexicon(zemberek.morphology.lexicon.RootLexicon) BlockTextLoader(zemberek.core.text.BlockTextLoader)
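
The concurrency pattern in this example is worth isolating: a BlockingExecutor bounds how many chunks are in flight at once, a synchronized block serializes writes to the shared PrintWriter, and, as in Example 9, an awaitTermination call makes sure every task finishes before the writer is closed. A stripped-down sketch under those assumptions, with hypothetical paths, block size, and thread count:

// Sketch of the bounded-parallelism pattern: one task per chunk, serialized writes,
// then shutdown and wait. Paths, block size, and thread count are placeholders.
import java.io.PrintWriter;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.concurrent.TimeUnit;
import zemberek.core.concurrency.BlockingExecutor;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class ParallelChunkSketch {
    public static void main(String[] args) throws Exception {
        BlockTextLoader loader = BlockTextLoader.fromPaths(
            Collections.singletonList(Paths.get("corpus.txt")), 30_000);
        BlockingExecutor executor = new BlockingExecutor(4);
        Object lock = new Object();
        try (PrintWriter pw = new PrintWriter(Paths.get("out.txt").toFile(), "UTF-8")) {
            for (TextChunk chunk : loader) {
                executor.submit(() -> {
                    // Per-chunk processing would happen here; only the write is serialized.
                    synchronized (lock) {
                        chunk.getData().forEach(pw::println);
                    }
                });
            }
            executor.shutdown();
            // Wait for all submitted tasks before the try-with-resources closes the writer.
            executor.awaitTermination(1, TimeUnit.DAYS);
        }
    }
}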

Example 8 with BlockTextLoader

Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class Dictionary, method readFromFile:

static Dictionary readFromFile(Path file, final Args args) throws IOException {
    Log.info("Initialize dictionary and histograms.");
    Dictionary dictionary = new Dictionary(args);
    Log.info("Loading text.");
    BlockTextLoader loader = new BlockTextLoader(file, 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int blockCounter = 1;
    for (List<String> lines : loader) {
        for (String line : lines) {
            List<String> split = tokenizer.splitToList(line);
            split.add(EOS);
            for (String word : split) {
                if (word.startsWith("#")) {
                    continue;
                }
                dictionary.addWithCount(word, 1);
            }
        }
        Log.info("Lines read: %d (thousands) ", blockCounter * 100);
        blockCounter++;
    }
    Log.info("Word + Label count = %d", dictionary.words_.size());
    Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d", args.minCount, args.minCountLabel);
    // now we have the histograms. Remove based on count.
    // Sort words before labels; within the same type, most frequent entries first.
    dictionary.words_.sort((e1, e2) -> {
        if (e1.type != e2.type) {
            return Integer.compare(e1.type, e2.type);
        } else {
            return Long.compare(e2.count, e1.count);
        }
    });
    LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
    List<Entry> toRemove = dictionary.words_.stream().filter(s -> (s.type == TYPE_WORD && s.count < args.minCount || s.type == TYPE_LABEL && s.count < args.minCountLabel)).collect(Collectors.toList());
    all.removeAll(toRemove);
    dictionary.words_ = new ArrayList<>(all);
    dictionary.size_ = 0;
    dictionary.nwords_ = 0;
    dictionary.nlabels_ = 0;
    Arrays.fill(dictionary.word2int_, -1);
    for (Entry e : dictionary.words_) {
        int i = dictionary.find(e.word);
        dictionary.word2int_[i] = dictionary.size_++;
        if (e.type == TYPE_WORD) {
            dictionary.nwords_++;
        }
        if (e.type == TYPE_LABEL) {
            dictionary.nlabels_++;
        }
    }
    Log.info("Word count = %d , Label count = %d", dictionary.nwords(), dictionary.nlabels());
    dictionary.initTableDiscard();
    Log.info("Adding character n-grams for words.");
    dictionary.initNgrams();
    Log.info("Done.");
    return dictionary;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) IOException(java.io.IOException) Random(java.util.Random) UIntIntMap(zemberek.core.collections.UIntIntMap) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) IntVector(zemberek.core.collections.IntVector) BlockTextLoader(zemberek.core.text.BlockTextLoader)
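
The middle of this method is a generic sort-and-prune step: order the entries by type (words before labels) and then by descending count, and drop everything below a per-type minimum. Here is a small standalone illustration of that step using plain Java collections; the Entry class and the threshold parameters are hypothetical stand-ins for the Dictionary internals.

// Standalone sketch of the sort-and-prune logic; Entry is a simplified stand-in.
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.stream.Collectors;

public class PruneSketch {

    static final int TYPE_WORD = 0;
    static final int TYPE_LABEL = 1;

    static class Entry {
        final String word;
        final int type;
        final long count;
        Entry(String word, int type, long count) {
            this.word = word;
            this.type = type;
            this.count = count;
        }
    }

    static List<Entry> prune(List<Entry> entries, long minCount, long minCountLabel) {
        // Words before labels; within a type, most frequent first.
        entries.sort((e1, e2) -> e1.type != e2.type
            ? Integer.compare(e1.type, e2.type)
            : Long.compare(e2.count, e1.count));
        LinkedHashSet<Entry> all = new LinkedHashSet<>(entries);
        List<Entry> toRemove = entries.stream()
            .filter(e -> (e.type == TYPE_WORD && e.count < minCount)
                || (e.type == TYPE_LABEL && e.count < minCountLabel))
            .collect(Collectors.toList());
        all.removeAll(toRemove);
        return new ArrayList<>(all);
    }
}

In the Dictionary itself, the surviving entries are then re-indexed into word2int_, which is what the loop after the pruning step does.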

Example 9 with BlockTextLoader

Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class PreprocessTurkishCorpus, method run:

@Override
public void run() throws IOException, InterruptedException {
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList);
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1).filter(s -> s.toFile().isDirectory() && !s.equals(input)).collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            paths.addAll(Files.walk(directory, 1).filter(s -> s.toFile().isFile() && (extension == null || s.endsWith(extension))).collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    final long total = totalLines;
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    if (operation == Operation.LEMMA) {
        morphology = TurkishMorphology.createWithDefaults();
    }
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        AtomicInteger count = new AtomicInteger(0);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                // Ignore meta tag lines.
                List<String> processed = chunk.getData().stream()
                    .filter(s -> !s.startsWith("<"))
                    .map(TextUtil::normalizeSpacesAndSoftHyphens)
                    .collect(Collectors.toList());
                List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
                sentences = sentences.stream().filter(s -> !TextUtil.containsCombiningDiacritics(s)).map(s -> {
                    if (operation == Operation.LEMMA) {
                        return replaceWordsWithLemma(s);
                    } else {
                        return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
                    }
                }).map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s).collect(Collectors.toList());
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    int c = count.addAndGet(chunk.size());
                    System.out.println(String.format("(%d of %d lines) processed.", c, total));
                }
            });
        }
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.DAYS);
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) Parameter(com.beust.jcommander.Parameter) TextUtil(zemberek.core.text.TextUtil) ConsoleApp(zemberek.apps.ConsoleApp) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) ConcurrencyUtil(zemberek.core.concurrency.ConcurrencyUtil) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) TextIO(zemberek.core.text.TextIO) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) BlockTextLoader(zemberek.core.text.BlockTextLoader)
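
Stripped of the file handling and threading, the per-chunk pipeline in this example is: drop meta-tag lines, normalize spaces and soft hyphens, split paragraphs into sentences, tokenize (or lemmatize), and optionally lowercase with the Turkish locale. Below is a minimal sequential sketch of the tokenization branch, using only zemberek calls already shown above; the input strings are made up.

// Sequential version of the per-chunk pipeline from PreprocessTurkishCorpus.run.
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import zemberek.core.text.TextUtil;
import zemberek.core.turkish.Turkish;
import zemberek.tokenization.TurkishSentenceExtractor;
import zemberek.tokenization.TurkishTokenizer;

public class PipelineSketch {
    public static void main(String[] args) {
        List<String> paragraphs = Arrays.asList(
            "<doc id=\"1\">",                        // meta tag line, dropped below
            "Merhaba dünya. Bugün hava çok güzel."); // made-up text
        List<String> processed = paragraphs.stream()
            .filter(s -> !s.startsWith("<"))
            .map(TextUtil::normalizeSpacesAndSoftHyphens)
            .collect(Collectors.toList());
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
        List<String> tokenized = sentences.stream()
            .map(s -> String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s)))
            .map(s -> s.toLowerCase(Turkish.LOCALE))
            .collect(Collectors.toList());
        tokenized.forEach(System.out::println);
    }
}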

Example 10 with BlockTextLoader

Use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

From the class Dictionary, method readFromFile (a second variant of the method shown in Example 8):

static Dictionary readFromFile(Path file, final Args args) {
    Log.info("Initialize dictionary and histograms.");
    Dictionary dictionary = new Dictionary(args);
    Log.info("Loading text.");
    BlockTextLoader loader = BlockTextLoader.fromPath(file, 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int blockCounter = 1;
    for (TextChunk lines : loader) {
        for (String line : lines) {
            List<String> split = tokenizer.splitToList(line);
            split.add(EOS);
            for (String word : split) {
                if (word.startsWith("#")) {
                    continue;
                }
                dictionary.add(word);
            }
        }
        Log.info("Lines read: %d (thousands) ", blockCounter * 100);
        blockCounter++;
    }
    Log.info("Word + Label count = %d", dictionary.words_.size());
    Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d", args.minCount, args.minCountLabel);
    // now we have the histograms. Remove based on count.
    dictionary.words_.sort((e1, e2) -> {
        if (e1.type != e2.type) {
            return Integer.compare(e1.type, e2.type);
        } else {
            return Long.compare(e2.count, e1.count);
        }
    });
    // TODO: add threshold method.
    LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
    List<Entry> toRemove = dictionary.words_.stream().filter(s -> (s.type == TYPE_WORD && s.count < args.minCount || s.type == TYPE_LABEL && s.count < args.minCountLabel)).collect(Collectors.toList());
    all.removeAll(toRemove);
    dictionary.words_ = new ArrayList<>(all);
    dictionary.size_ = 0;
    dictionary.nwords_ = 0;
    dictionary.nlabels_ = 0;
    Arrays.fill(dictionary.word2int_, -1);
    for (Entry e : dictionary.words_) {
        int i = dictionary.find(e.word);
        dictionary.word2int_[i] = dictionary.size_++;
        if (e.type == TYPE_WORD) {
            dictionary.nwords_++;
        }
        if (e.type == TYPE_LABEL) {
            dictionary.nlabels_++;
        }
    }
    Log.info("Word count = %d , Label count = %d", dictionary.nwords(), dictionary.nlabels());
    dictionary.initTableDiscard();
    dictionary.initNgrams();
    return dictionary;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) IntIntMap(zemberek.core.collections.IntIntMap) DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) IOException(java.io.IOException) Random(java.util.Random) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) IntVector(zemberek.core.collections.IntVector) BlockTextLoader(zemberek.core.text.BlockTextLoader)
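
Examples 8 and 10 show the same method written against two generations of the loader API: the older form constructs BlockTextLoader directly and iterates blocks as List<String>, while the newer form uses the fromPath factory and iterates TextChunk objects, which are themselves iterable over lines and also expose getData(). The fragments below contrast just the iteration; they presumably target different zemberek versions, so a single release likely compiles only one of them ('file' is a java.nio.file.Path placeholder).

// Older style (Example 8): constructor, blocks arrive as List<String>.
BlockTextLoader loader = new BlockTextLoader(file, 100_000);
for (List<String> lines : loader) {
    for (String line : lines) {
        // process line
    }
}

// Newer style (Example 10): factory method, blocks arrive as TextChunk.
BlockTextLoader loader2 = BlockTextLoader.fromPath(file, 100_000);
for (TextChunk chunk : loader2) {
    for (String line : chunk) {
        // process line
    }
}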

Aggregations

BlockTextLoader (zemberek.core.text.BlockTextLoader) 15
Path (java.nio.file.Path) 12
TextChunk (zemberek.core.text.TextChunk) 11
ArrayList (java.util.ArrayList) 7
LinkedHashSet (java.util.LinkedHashSet) 6
TurkishMorphology (zemberek.morphology.TurkishMorphology) 6
IOException (java.io.IOException) 4
PrintWriter (java.io.PrintWriter) 4
List (java.util.List) 4
Collectors (java.util.stream.Collectors) 4
Log (zemberek.core.logging.Log) 4
Random (java.util.Random) 3
BlockingExecutor (zemberek.core.concurrency.BlockingExecutor) 3
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 3
DataInputStream (java.io.DataInputStream) 2
DataOutputStream (java.io.DataOutputStream) 2
Files (java.nio.file.Files) 2
Arrays (java.util.Arrays) 2
HashSet (java.util.HashSet) 2
Set (java.util.Set) 2