Search in sources :

Example 1 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

the class NoisyWordsLexiconGenerator method main.

/**
 * Runs the noisy-word lexicon generation over a fixed corpus location.
 *
 * <p>Steps, in order: create the work directory, build a {@code NormalizationVocabulary} from the
 * "correct", "incorrect" and "possibly-incorrect" files under the work directory, create the
 * graph from the corpus blocks, then create candidates for the words loaded from the
 * "incorrect" and "possibly-incorrect" histograms.
 *
 * @param args not used; all paths are hard-coded below
 * @throws Exception if any of the called steps fails
 */
public static void main(String[] args) throws Exception {
    // Half of the available processors, kept within [1, 22]. Without the lower bound the
    // integer division yields 0 on a single-processor machine.
    int threadCount = Math.max(1, Math.min(22, Runtime.getRuntime().availableProcessors() / 2));
    Path corporaRoot = Paths.get("/home/aaa/data/normalization/corpus");
    Path workDir = Paths.get("/home/aaa/data/normalization/test-large");
    Path corpusDirList = corporaRoot.resolve("all-list");
    Files.createDirectories(workDir);
    Path correct = workDir.resolve("correct");
    Path incorrect = workDir.resolve("incorrect");
    Path maybeIncorrect = workDir.resolve("possibly-incorrect");
    NormalizationVocabulary vocabulary = new NormalizationVocabulary(correct, incorrect, maybeIncorrect, 1, 3, 1);
    NoisyWordsLexiconGenerator generator = new NoisyWordsLexiconGenerator(vocabulary, threadCount);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 50_000);
    // create graph
    Path graphPath = workDir.resolve("graph");
    generator.createGraph(corpusProvider, graphPath);
    // Candidates are generated for the union of the incorrect and possibly-incorrect words.
    Histogram<String> incorrectWords = Histogram.loadFromUtf8File(incorrect, ' ');
    incorrectWords.add(Histogram.loadFromUtf8File(maybeIncorrect, ' '));
    generator.createCandidates(graphPath, workDir, incorrectWords);
    Log.info("Done");
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader)

Example 2 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

the class NormalizationScripts method splitSingleFileCorpus.

/**
 * Splits a single corpus file into numbered block files.
 *
 * <p>The input is read in blocks of 100,000 through {@code BlockTextLoader}; each block is
 * written as UTF-8 to {@code outRoot} under the name {@code <input file name>.<block number>},
 * with block numbers starting at 0.
 *
 * @param in the corpus file to split
 * @param outRoot directory receiving the block files; created if missing
 * @throws IOException if the directory cannot be created or a block cannot be written
 */
static void splitSingleFileCorpus(Path in, Path outRoot) throws IOException {
    final int linesPerBlock = 100_000;
    BlockTextLoader chunks = BlockTextLoader.fromPath(in, linesPerBlock);
    Files.createDirectories(outRoot);
    int blockNo = 0;
    for (TextChunk chunk : chunks) {
        String fileName = in.toFile().getName() + "." + blockNo;
        Files.write(outRoot.resolve(fileName), chunk, StandardCharsets.UTF_8);
        blockNo += 1;
    }
}
Also used : Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk)

Example 3 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

the class CorpusNerCollector method main.

/**
 * Collects NER training sentences from a corpus using an existing perceptron NER model.
 *
 * <p>For every corpus chunk the distinct cleaned sentences of at most 100 characters are tagged.
 * A sentence is kept when it has one or two named entities and none of them is rejected by
 * {@link #containsBadNamedEntity}. Kept sentences are written in bracket annotation style, in
 * files of 1000 sentences named {@code <chunk id>-<file number>}. The program exits once
 * file number 10 has been written.
 *
 * @param args not used; all paths are hard-coded below
 * @throws IOException if the output directory or an output file cannot be written
 */
public static void main(String[] args) throws IOException {
    Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
    Path corpusDirList = corporaRoot.resolve("ner-list");
    Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
    Files.createDirectories(outRoot);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);
    // assumes you generated a model in my-model directory.
    Path modelRoot = Paths.get("my-model");
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
    Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
    List<String> lines = new ArrayList<>();
    int c = 0;
    int k = 0;
    for (TextChunk chunk : corpusProvider) {
        // LinkedHashSet drops repeated sentences within the chunk while keeping their order.
        LinkedHashSet<String> sentences = new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
        for (String sentence : sentences) {
            if (sentence.length() > 100) {
                continue;
            }
            NerSentence result = ner.findNamedEntities(sentence);
            List<NamedEntity> nes = result.getNamedEntities();
            if (containsBadNamedEntity(nes, illegal, morphology)) {
                continue;
            }
            int neCount = nes.size();
            if (neCount > 0 && neCount < 3) {
                lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
                c++;
                if (c == 1000) {
                    Path out = outRoot.resolve(chunk.id + "-" + k);
                    Files.write(out, lines);
                    Log.info("%s created. ", out);
                    lines = new ArrayList<>();
                    c = 0;
                    k++;
                    if (k > 10) {
                        System.exit(0);
                    }
                }
            }
        }
    }
    // NOTE(review): sentences collected after the last full batch of 1000 are never written.
    // Confirm whether dropping the final partial batch is intended.
}

/**
 * Tells whether any of the given named entities should disqualify its sentence.
 *
 * <p>An entity is bad when one of its tokens is in {@code illegal}, or when any morphological
 * analysis of a token has a dictionary item whose secondary POS is neither
 * {@code Abbreviation} nor {@code ProperNoun}. The scan stops at the first bad token, so the
 * remaining tokens are not analyzed.
 *
 * @param entities named entities found in one sentence
 * @param illegal token strings that must not appear inside a named entity
 * @param morphology analyzer used to inspect each token
 * @return {@code true} if at least one entity is bad
 */
private static boolean containsBadNamedEntity(List<NamedEntity> entities, Set<String> illegal, TurkishMorphology morphology) {
    for (NamedEntity ne : entities) {
        for (NerToken token : ne.tokens) {
            if (illegal.contains(token.word)) {
                return true;
            }
            WordAnalysis a = morphology.analyze(token.word);
            for (SingleAnalysis analysis : a) {
                DictionaryItem item = analysis.getDictionaryItem();
                if (item.secondaryPos != SecondaryPos.Abbreviation && item.secondaryPos != SecondaryPos.ProperNoun) {
                    return true;
                }
            }
        }
    }
    return false;
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) BlockTextLoader(zemberek.core.text.BlockTextLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)

Example 4 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

the class RemoveDuplicateLines method recreateCorpus.

/**
 * Writes the corpus to {@code output}, keeping only the lines selected by {@code index}.
 *
 * <p>Lines are read in blocks of 10,000. A line is written only when the value stored in
 * {@code index} for its hash equals the line's 1-based position in the corpus, presumably the
 * position recorded by {@code findDuplicates} (verify against that method). When
 * {@code normalizeLines} is set the hash is taken from the processed line, but the original
 * line is what gets written. When {@code writeCounts} is set each written line is prefixed
 * with its {@code histogram} count and a space.
 *
 * <p>Reading stops after the first block that pushes the line count past {@code count},
 * unless {@code count} is -1.
 *
 * @throws IOException if the output cannot be opened, or if any write to it failed
 */
private void recreateCorpus() throws IOException {
    int lineCounter = 0;
    int writtenLines = 0;
    try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(IOUtil.geBufferedOutputStream(output), "UTF-8"))) {
        BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
        for (TextChunk block : loader) {
            for (String line : block.getData()) {
                String l = line;
                if (normalizeLines) {
                    l = process(line);
                }
                lineCounter++;
                if (lineCounter % PROGRESS == 0) {
                    Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
                }
                long hash = longHash(l);
                if (index.get(hash) == lineCounter) {
                    if (writeCounts) {
                        writer.println(histogram.get(hash) + " " + line);
                    } else {
                        writer.println(line);
                    }
                    writtenLines++;
                }
            }
            // The limit is checked once per block, so the last block is always processed whole.
            if (count != -1 && lineCounter > count) {
                break;
            }
        }
        Log.info("Total lines read: %d. Lines Written: %d", lineCounter, writtenLines);
        // PrintWriter never throws on a failed write; it only records the failure.
        // checkError() flushes and reports it, so a truncated output does not pass silently.
        if (writer.checkError()) {
            throw new IOException("Failed to write de-duplicated corpus to " + output);
        }
    }
}
Also used : BlockTextLoader(zemberek.core.text.BlockTextLoader) OutputStreamWriter(java.io.OutputStreamWriter) TextChunk(zemberek.core.text.TextChunk) PrintWriter(java.io.PrintWriter)

Example 5 with BlockTextLoader

use of zemberek.core.text.BlockTextLoader in project zemberek-nlp by ahmetaa.

the class RemoveDuplicateLines method findDuplicates.

/**
 * Scans the corpus and records, per line hash, where it was first seen and how often it occurs.
 *
 * <p>Lines are read in blocks of 10,000. For each line (processed first when
 * {@code normalizeLines} is set) the hash is looked up in {@code index}: a known hash counts as
 * a duplicate, an unknown one is stored with the current {@code totalCount}. Every hash is also
 * incremented in {@code histogram}. Reading stops after the first block that pushes the number
 * of lines read by this call past {@code count}, unless {@code count} is -1. Totals and the
 * duplicate percentage are logged at the end.
 */
private void findDuplicates() {
    int linesReadHere = 0;
    for (TextChunk chunk : BlockTextLoader.fromPath(corpus, 10_000)) {
        for (String rawLine : chunk.getData()) {
            String hashed = normalizeLines ? process(rawLine) : rawLine;
            totalCount++;
            linesReadHere++;
            if (totalCount % PROGRESS == 0) {
                Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
            }
            long key = longHash(hashed);
            if (!index.containsKey(key)) {
                // First sighting: remember the running line count for this hash.
                index.put(key, totalCount);
            } else {
                duplicateCount++;
            }
            histogram.increment(key);
        }
        // The limit is checked once per block, so the last block is always processed whole.
        boolean limitReached = count != -1 && linesReadHere > count;
        if (limitReached) {
            break;
        }
    }
    Log.info("Total lines read: %d. Duplicates: %d", totalCount, duplicateCount);
    Log.info("Duplicate Ratio: %.3f", duplicateCount * 100.0d / totalCount);
}
Also used : BlockTextLoader(zemberek.core.text.BlockTextLoader) TextChunk(zemberek.core.text.TextChunk)

Aggregations

BlockTextLoader (zemberek.core.text.BlockTextLoader) 15, Path (java.nio.file.Path) 12, TextChunk (zemberek.core.text.TextChunk) 11, ArrayList (java.util.ArrayList) 7, LinkedHashSet (java.util.LinkedHashSet) 6, TurkishMorphology (zemberek.morphology.TurkishMorphology) 6, IOException (java.io.IOException) 4, PrintWriter (java.io.PrintWriter) 4, List (java.util.List) 4, Collectors (java.util.stream.Collectors) 4, Log (zemberek.core.logging.Log) 4, Random (java.util.Random) 3, BlockingExecutor (zemberek.core.concurrency.BlockingExecutor) 3, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 3, DataInputStream (java.io.DataInputStream) 2, DataOutputStream (java.io.DataOutputStream) 2, Files (java.nio.file.Files) 2, Arrays (java.util.Arrays) 2, HashSet (java.util.HashSet) 2, Set (java.util.Set) 2