
Example 6 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class ItemFindExperiment, the method processCorpus:

static void processCorpus(Path in, Path out) throws IOException {
    BlockTextLoader loader = BlockTextLoader.fromPath(in, 10000);
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (TextChunk chunk : loader) {
            // De-duplicate lines within the block, preserving order.
            LinkedHashSet<String> unique = new LinkedHashSet<>(chunk.getData());
            for (String l : unique) {
                // Skip lines that contain any of the markup characters [ ] # ~ |
                if (!Strings.containsNone(l, "[]#~|")) {
                    continue;
                }
                // 'es' is a Spanish Locale field defined elsewhere in this class.
                // Keep only digits, lowercase Latin letters and Spanish diacritics.
                l = l.toLowerCase(es)
                        .replaceAll("[^0-9a-zñáéíóúü]", " ")
                        .replaceAll("\\s+", " ")
                        .trim();
                // Drop short lines; this check also covers the empty case.
                if (l.length() < 20) {
                    continue;
                }
                pw.println(l);
            }
        }
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk) PrintWriter(java.io.PrintWriter)
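
The core of this example is the block-by-block read: BlockTextLoader streams the file as TextChunk objects of a fixed line count, so the whole corpus never has to fit in memory. A minimal sketch of just that loop, with a placeholder file name:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class BlockIterationSketch {

    public static void main(String[] args) throws IOException {
        // Placeholder path; point this at a real corpus file.
        Path corpus = Paths.get("corpus.txt");
        // Read the file lazily, 10,000 lines at a time.
        BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
        for (TextChunk chunk : loader) {
            // A TextChunk is iterable over its lines; getData() returns them as a List.
            for (String line : chunk) {
                System.out.println(line.length());
            }
        }
    }
}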

Example 7 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class StemDisambiguationExperiment, the method doit:

private void doit() throws IOException, InterruptedException {
    System.setProperty("org.jline.terminal.dumb", "true");
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList, "#");
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
                .filter(s -> s.toFile().isDirectory() && !s.equals(input))
                .collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            paths.addAll(Files.walk(directory, 1)
                    .filter(s -> s.toFile().isFile())
                    .collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 30_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                List<String> data = new ArrayList<>(new LinkedHashSet<>(chunk.getData()));
                List<String> sentences = TextCleaner.cleanAndExtractSentences(data);
                sentences = sentences.stream()
                        .filter(this::unambiguous)
                        .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
                        .collect(Collectors.toList());
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    System.out.println(chunk.size());
                }
            });
        }
        executor.shutdown();
        // Wait for in-flight tasks to finish before try-with-resources closes the writer.
        executor.awaitTermination(1, TimeUnit.DAYS);
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter) TextCleaner(zemberek.normalization.TextCleaner) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) RootLexicon(zemberek.morphology.lexicon.RootLexicon) BlockTextLoader(zemberek.core.text.BlockTextLoader)
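
Beyond the loading, this example shows the fan-out pattern used across these experiments: chunks are submitted to a BlockingExecutor, which applies back-pressure when all threads are busy, and output is serialized through a synchronized block so only one task writes at a time. A stripped-down sketch of that skeleton, assuming placeholder paths, an arbitrary thread count of 4, and a no-op per-chunk transform:

import java.io.PrintWriter;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import zemberek.core.concurrency.BlockingExecutor;
import zemberek.core.text.BlockTextLoader;
import zemberek.core.text.TextChunk;

public class ParallelChunkSketch {

    public static void main(String[] args) throws Exception {
        Path in = Paths.get("in.txt");   // placeholder input
        Path out = Paths.get("out.txt"); // placeholder output
        Object lock = new Object();
        AtomicLong written = new AtomicLong(0);
        try (PrintWriter pw = new PrintWriter(out.toFile(), "UTF-8")) {
            BlockTextLoader loader = BlockTextLoader.fromPath(in, 30_000);
            BlockingExecutor executor = new BlockingExecutor(4);
            for (TextChunk chunk : loader) {
                executor.submit(() -> {
                    // Replace with real per-chunk processing.
                    List<String> lines = new ArrayList<>(chunk.getData());
                    synchronized (lock) {
                        lines.forEach(pw::println);
                        written.addAndGet(lines.size());
                    }
                });
            }
            executor.shutdown();
            // Ensure all tasks finish before the writer is closed.
            executor.awaitTermination(1, TimeUnit.DAYS);
        }
        System.out.println(written.get() + " lines written.");
    }
}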

Example 8 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class PreprocessTurkishCorpus, the method run:

@Override
public void run() throws IOException, InterruptedException {
    List<Path> paths = new ArrayList<>();
    if (input.toFile().isFile()) {
        paths.add(input);
    } else {
        Set<String> dirNamesToProcess = new HashSet<>();
        if (dirList != null) {
            List<String> dirNames = TextIO.loadLines(dirList);
            Log.info("Directory names to process:");
            for (String dirName : dirNames) {
                Log.info(dirName);
            }
            dirNamesToProcess.addAll(dirNames);
        }
        List<Path> directories = Files.walk(input, recurse ? Integer.MAX_VALUE : 1)
                .filter(s -> s.toFile().isDirectory() && !s.equals(input))
                .collect(Collectors.toList());
        for (Path directory : directories) {
            if (dirList != null && !dirNamesToProcess.contains(directory.toFile().getName())) {
                continue;
            }
            // Note: Path.endsWith compares path segments, not string suffixes, so the
            // string form is needed for an extension filter like ".txt" to work.
            paths.addAll(Files.walk(directory, 1)
                    .filter(s -> s.toFile().isFile()
                            && (extension == null || s.toString().endsWith(extension)))
                    .collect(Collectors.toList()));
        }
    }
    Log.info("There are %d files to process.", paths.size());
    long totalLines = 0;
    for (Path path : paths) {
        totalLines += TextIO.lineCount(path);
    }
    final long total = totalLines;
    if (paths.size() == 0) {
        Log.info("No corpus files found for input : %s", input);
        System.exit(0);
    }
    AtomicLong sentenceCount = new AtomicLong(0);
    if (operation == Operation.LEMMA) {
        morphology = TurkishMorphology.createWithDefaults();
    }
    try (PrintWriter pw = new PrintWriter(output.toFile(), "UTF-8")) {
        BlockTextLoader loader = BlockTextLoader.fromPaths(paths, 10_000);
        BlockingExecutor executor = new BlockingExecutor(threadCount);
        AtomicInteger count = new AtomicInteger(0);
        for (TextChunk chunk : loader) {
            executor.submit(() -> {
                List<String> processed = chunk.getData().stream()
                        .filter(s -> !s.startsWith("<")) // ignore meta tag lines.
                        .map(TextUtil::normalizeSpacesAndSoftHyphens)
                        .collect(Collectors.toList());
                List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(processed);
                sentences = sentences.stream()
                        .filter(s -> !TextUtil.containsCombiningDiacritics(s))
                        .map(s -> {
                            if (operation == Operation.LEMMA) {
                                return replaceWordsWithLemma(s);
                            } else {
                                return String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
                            }
                        })
                        .map(s -> toLowercase ? s.toLowerCase(Turkish.LOCALE) : s)
                        .collect(Collectors.toList());
                synchronized (this) {
                    sentences.forEach(pw::println);
                    sentenceCount.addAndGet(sentences.size());
                    int c = count.addAndGet(chunk.size());
                    System.out.println(String.format("(%d of %d lines) processed.", c, total));
                }
            });
        }
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.DAYS);
    }
    Log.info("%d sentences are written in %s", sentenceCount.get(), output);
}
Also used : Path(java.nio.file.Path) Parameter(com.beust.jcommander.Parameter) TextUtil(zemberek.core.text.TextUtil) ConsoleApp(zemberek.apps.ConsoleApp) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Turkish(zemberek.core.turkish.Turkish) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) ConcurrencyUtil(zemberek.core.concurrency.ConcurrencyUtil) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) TextIO(zemberek.core.text.TextIO) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) BlockTextLoader(zemberek.core.text.BlockTextLoader)
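
Stripped of the concurrency scaffolding, the per-chunk work in run is a plain text pipeline: drop meta-tag lines, normalize spacing, split paragraphs into sentences, tokenize, and optionally lowercase. A sequential sketch of that pipeline on a hard-coded two-line paragraph list:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import zemberek.core.text.TextUtil;
import zemberek.core.turkish.Turkish;
import zemberek.tokenization.TurkishSentenceExtractor;
import zemberek.tokenization.TurkishTokenizer;

public class PipelineSketch {

    public static void main(String[] args) {
        List<String> paragraphs = Arrays.asList(
            "<doc id=\"1\">",                          // meta-tag line, filtered out
            "Dün İstanbul'a gittim. Hava çok güzeldi." // real content
        );
        List<String> cleaned = paragraphs.stream()
            .filter(s -> !s.startsWith("<"))
            .map(TextUtil::normalizeSpacesAndSoftHyphens)
            .collect(Collectors.toList());
        // Split paragraphs into sentences, then tokenize each sentence.
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(cleaned);
        for (String s : sentences) {
            String tokenized = String.join(" ", TurkishTokenizer.DEFAULT.tokenizeToStrings(s));
            System.out.println(tokenized.toLowerCase(Turkish.LOCALE));
        }
    }
}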

Example 9 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class Dictionary, the method readFromFile:

static Dictionary readFromFile(Path file, final Args args) {
    Log.info("Initialize dictionary and histograms.");
    Dictionary dictionary = new Dictionary(args);
    Log.info("Loading text.");
    BlockTextLoader loader = BlockTextLoader.fromPath(file, 100_000);
    SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
    int blockCounter = 1;
    for (TextChunk lines : loader) {
        for (String line : lines) {
            List<String> split = tokenizer.splitToList(line);
            split.add(EOS);
            for (String word : split) {
                if (word.startsWith("#")) {
                    continue;
                }
                dictionary.add(word);
            }
        }
        Log.info("Lines read: %d (thousands) ", blockCounter * 100);
        blockCounter++;
    }
    Log.info("Word + Label count = %d", dictionary.words_.size());
    Log.info("Removing word and labels with small counts. Min word = %d, Min Label = %d", args.minCount, args.minCountLabel);
    // now we have the histograms. Remove based on count.
    dictionary.words_.sort((e1, e2) -> {
        if (e1.type != e2.type) {
            return Integer.compare(e1.type, e2.type);
        } else {
            return Long.compare(e2.count, e1.count);
        }
    });
    // TODO: add threshold method.
    LinkedHashSet<Entry> all = new LinkedHashSet<>(dictionary.words_);
    List<Entry> toRemove = dictionary.words_.stream()
            .filter(s -> (s.type == TYPE_WORD && s.count < args.minCount)
                    || (s.type == TYPE_LABEL && s.count < args.minCountLabel))
            .collect(Collectors.toList());
    all.removeAll(toRemove);
    dictionary.words_ = new ArrayList<>(all);
    dictionary.size_ = 0;
    dictionary.nwords_ = 0;
    dictionary.nlabels_ = 0;
    Arrays.fill(dictionary.word2int_, -1);
    for (Entry e : dictionary.words_) {
        int i = dictionary.find(e.word);
        dictionary.word2int_[i] = dictionary.size_++;
        if (e.type == TYPE_WORD) {
            dictionary.nwords_++;
        }
        if (e.type == TYPE_LABEL) {
            dictionary.nlabels_++;
        }
    }
    Log.info("Word count = %d , Label count = %d", dictionary.nwords(), dictionary.nlabels());
    dictionary.initTableDiscard();
    dictionary.initNgrams();
    return dictionary;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) IntIntMap(zemberek.core.collections.IntIntMap) DataInputStream(java.io.DataInputStream) Arrays(java.util.Arrays) IOException(java.io.IOException) Random(java.util.Random) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) SpaceTabTokenizer(zemberek.core.SpaceTabTokenizer) List(java.util.List) DataOutputStream(java.io.DataOutputStream) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Path(java.nio.file.Path) IntVector(zemberek.core.collections.IntVector) BlockTextLoader(zemberek.core.text.BlockTextLoader)
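
The dictionary construction reduces to: count tokens, order entries, and drop anything under a minimum count. A sketch of that count-and-prune step with plain JDK collections and an arbitrary threshold of 2 (the real code also separates word and label entry types, omitted here):

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import zemberek.core.SpaceTabTokenizer;

public class CountPruneSketch {

    public static void main(String[] args) {
        int minCount = 2; // arbitrary threshold, mirrors args.minCount
        SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
        Map<String, Long> counts = new HashMap<>();
        for (String line : new String[] {"a b a", "b c a"}) {
            for (String word : tokenizer.splitToList(line)) {
                counts.merge(word, 1L, Long::sum);
            }
        }
        // Keep frequent words, most frequent first.
        List<String> kept = counts.entrySet().stream()
            .filter(e -> e.getValue() >= minCount)
            .sorted((e1, e2) -> Long.compare(e2.getValue(), e1.getValue()))
            .map(Map.Entry::getKey)
            .collect(Collectors.toList());
        System.out.println(kept); // prints [a, b]
    }
}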

Example 10 with TextChunk

Use of zemberek.core.text.TextChunk in project zemberek-nlp by ahmetaa.

From the class ProcessTwnertcData, the method main:

public static void main(String[] args) throws IOException {
    Path corpus = Paths.get("/media/ahmetaa/depo/ner/TWNERTC_All_Versions/TWNERTC_TC_Coarse_Grained_NER_DomainDependent_NoiseReduction.DUMP");
    Path nerOut = Paths.get("/media/ahmetaa/depo/ner/ner-coarse");
    Path categoryOut = Paths.get("/media/ahmetaa/depo/classification/twnertc-data");
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    List<String> nerLines = new ArrayList<>();
    List<String> categoryLines = new ArrayList<>();
    Histogram<String> categories = new Histogram<>();
    for (TextChunk chunk : loader) {
        for (String line : chunk) {
            List<String> parts = TextUtil.TAB_SPLITTER.splitToList(line);
            categoryLines.add("__label__" + parts.get(0) + " " + parts.get(2));
            categories.add(parts.get(0));
            List<String> nerLabels = TextUtil.SPACE_SPLITTER.splitToList(parts.get(1));
            List<String> nerWords = TextUtil.SPACE_SPLITTER.splitToList(parts.get(2));
            if (nerLabels.size() != nerWords.size()) {
                continue;
            }
            // Group consecutive tokens into ranges. Only "O" labels are handled in
            // the loop below; 'ranges' is accumulated but never written out here.
            List<NerRange> ranges = new ArrayList<>();
            NerRange range = new NerRange();
            for (int i = 0; i < nerLabels.size(); i++) {
                String lbl = nerLabels.get(i);
                String word = nerWords.get(i);
                if (lbl.equals("O")) {
                    if (range.type == null) {
                        range.type = "O";
                    } else {
                        if (range.type.equals("O")) {
                            range.seq.add(word);
                        } else {
                            ranges.add(range);
                            range = new NerRange();
                            range.type = "O";
                            range.seq.add(word);
                        }
                    }
                }
            }
        }
        Log.info("Lines processed: %d", chunk.index * loader.getBlockSize());
    }
    Files.write(categoryOut, categoryLines);
    categories.saveSortedByCounts(Paths.get("/media/ahmetaa/depo/classification/categories"), " ");
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk)
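
The category output uses the fastText training convention: a __label__ prefix glued to the class name, followed by the text. A tiny sketch of that conversion for one made-up tab-separated record:

import java.util.List;
import zemberek.core.text.TextUtil;

public class LabelLineSketch {

    public static void main(String[] args) {
        // Made-up record: category <TAB> ner-labels <TAB> text
        String line = "sports\tO O O O\tmaç dün akşam oynandı";
        List<String> parts = TextUtil.TAB_SPLITTER.splitToList(line);
        // fastText-style training line: "__label__<category> <text>"
        String labeled = "__label__" + parts.get(0) + " " + parts.get(2);
        System.out.println(labeled);
    }
}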

Aggregations

TextChunk (zemberek.core.text.TextChunk): 13
BlockTextLoader (zemberek.core.text.BlockTextLoader): 11
Path (java.nio.file.Path): 9
ArrayList (java.util.ArrayList): 6
LinkedHashSet (java.util.LinkedHashSet): 5
BlockingExecutor (zemberek.core.concurrency.BlockingExecutor): 5
IOException (java.io.IOException): 4
PrintWriter (java.io.PrintWriter): 4
TurkishMorphology (zemberek.morphology.TurkishMorphology): 4
List (java.util.List): 3
ExecutorService (java.util.concurrent.ExecutorService): 3
Collectors (java.util.stream.Collectors): 3
Log (zemberek.core.logging.Log): 3
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 3
Files (java.nio.file.Files): 2
HashSet (java.util.HashSet): 2
Random (java.util.Random): 2
Set (java.util.Set): 2
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2
AtomicLong (java.util.concurrent.atomic.AtomicLong): 2