Search in sources :

Example 26 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

In the class ProcessTwnertcData, method main:

/**
 * Reads the TWNERTC dump from a hard-coded path and writes a labelled classification file plus
 * a category frequency list.
 *
 * <p>Every corpus line is split on tabs. Field 0 is used as the category, field 1 as the
 * space-separated NER labels and field 2 as the space-separated tokens. For each line,
 * {@code "__label__" + category + " " + tokens} is collected and the category is counted.
 *
 * <p>NOTE(review): the NER part looks unfinished. {@code nerOut}, {@code nerLines} and the
 * per-line {@code ranges} list are never consumed, and only the {@code "O"} label is handled.
 *
 * @param args not used
 * @throws IOException if writing the output files fails
 */
public static void main(String[] args) throws IOException {
    Path corpus = Paths.get("/media/ahmetaa/depo/ner/TWNERTC_All_Versions/TWNERTC_TC_Coarse_Grained_NER_DomainDependent_NoiseReduction.DUMP");
    // Not used yet; presumably the target of the unfinished NER export.
    Path nerOut = Paths.get("/media/ahmetaa/depo/ner/ner-coarse");
    Path categoryOut = Paths.get("/media/ahmetaa/depo/classification/twnertc-data");
    BlockTextLoader loader = BlockTextLoader.fromPath(corpus, 10_000);
    // Not used yet; see the note on nerOut.
    List<String> nerLines = new ArrayList<>();
    List<String> categoryLines = new ArrayList<>();
    Histogram<String> categories = new Histogram<>();
    for (TextChunk chunk : loader) {
        for (String line : chunk) {
            List<String> parts = TextUtil.TAB_SPLITTER.splitToList(line);
            categoryLines.add("__label__" + parts.get(0) + " " + parts.get(2));
            categories.add(parts.get(0));
            List<String> nerLabels = TextUtil.SPACE_SPLITTER.splitToList(parts.get(1));
            List<String> nerWords = TextUtil.SPACE_SPLITTER.splitToList(parts.get(2));
            // Labels and tokens must align one-to-one to build ranges.
            if (nerLabels.size() != nerWords.size()) {
                continue;
            }
            List<NerRange> ranges = new ArrayList<>();
            NerRange range = new NerRange();
            for (int i = 0; i < nerLabels.size(); i++) {
                String lbl = nerLabels.get(i);
                String word = nerWords.get(i);
                if (lbl.equals("O")) {
                    // Close the current range only when it belongs to a different type.
                    if (range.type != null && !range.type.equals("O")) {
                        ranges.add(range);
                        range = new NerRange();
                    }
                    // Always keep the token: previously the first "O" token of a fresh range
                    // was dropped because only the type was set.
                    range.type = "O";
                    range.seq.add(word);
                }
            }
        }
        // Progress log: chunk index multiplied by the loader's block size.
        Log.info(chunk.index * loader.getBlockSize());
    }
    Files.write(categoryOut, categoryLines);
    categories.saveSortedByCounts(Paths.get("/media/ahmetaa/depo/classification/categories"), " ");
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) BlockTextLoader(zemberek.core.text.BlockTextLoader) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk)

Example 27 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

In the class NormalizationScripts, method splitWords:

/**
 * Proposes two-word splits for vocabulary words and writes the accepted ones to
 * {@code splitFile} (and, with the word count appended, to a sibling file whose name is the
 * absolute path of {@code splitFile} plus {@code "freq"}).
 *
 * <p>Words are loaded from the {@code "incorrect"} file under both vocabulary directories and
 * merged into one histogram. Words with a count below {@code minWordCount}, words that are keys
 * of the ASCII map, words shorter than five characters and words containing {@code '-'} are
 * skipped. For every remaining word each head/tail split is scored with the language model's
 * bigram probability; the highest scoring split is written if its score is above {@code -7}.
 *
 * @param noisyVocab directory containing an {@code "incorrect"} histogram file
 * @param cleanVocab directory containing an {@code "incorrect"} histogram file
 * @param splitFile output file for {@code word = head tail} lines
 * @param lmPath language model file
 * @param asciiMapPath file with {@code key=value} lines; only the keys are used
 * @param morphology analyzer used to reject "de"/"da" tails after a correct head
 * @param minWordCount minimum histogram count for a word to be considered
 * @throws IOException if reading the inputs or opening the outputs fails
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
    // Keys are everything before the first '=' on each line of the ASCII map.
    List<String> asciiMapLines = Files.readAllLines(asciiMapPath);
    Set<String> asciiMapKeys = asciiMapLines.stream()
        .map(entry -> entry.substring(0, entry.indexOf('=')))
        .collect(Collectors.toSet());

    SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
    Log.info("Language model = %s", lm.info());

    Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
    Histogram<String> cleanFreq = Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' ');
    wordFreq.add(cleanFreq);
    Log.info("%d words loaded.", wordFreq.size());

    wordFreq.removeSmaller(minWordCount);
    if (minWordCount > 1) {
        Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
    }

    int unkIndex = lm.getVocabulary().getUnknownWordIndex();
    String freqFileName = splitFile.toFile().getAbsolutePath() + "freq";
    try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
        PrintWriter pwFreq = new PrintWriter(freqFileName, "utf-8")) {
        for (String word : wordFreq.getSortedList()) {
            boolean skipWord = asciiMapKeys.contains(word) || word.length() < 5 || word.contains("-");
            if (skipWord) {
                continue;
            }

            List<ScoredItem<String>> candidates = new ArrayList<>();
            // The split point stops one short of the end, so the tail has at least two characters.
            for (int cut = 1; cut + 1 < word.length(); cut++) {
                String head = word.substring(0, cut);
                String tail = word.substring(cut);
                if (noSplitTails.contains(tail)) {
                    continue;
                }
                int headIndex = lm.getVocabulary().indexOf(head);
                int tailIndex = lm.getVocabulary().indexOf(tail);
                if (headIndex == unkIndex || tailIndex == unkIndex) {
                    continue;
                }
                boolean deDaTail = tail.equals("de") || tail.equals("da");
                if (deDaTail && morphology.analyze(head).isCorrect()) {
                    continue;
                }
                if (!lm.ngramExists(headIndex, tailIndex)) {
                    continue;
                }
                candidates.add(new ScoredItem<>(head + " " + tail, lm.getProbability(headIndex, tailIndex)));
            }

            if (candidates.isEmpty()) {
                continue;
            }
            // Highest score first; a single-element list is left untouched by the sort.
            candidates.sort((left, right) -> Double.compare(right.score, left.score));
            ScoredItem<String> best = candidates.get(0);
            if (best.score > -7) {
                String mapping = word + " = " + best.item;
                pw.println(mapping);
                pwFreq.println(mapping + " " + wordFreq.getCount(word));
            }
        }
    }
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) TextUtil(zemberek.core.text.TextUtil) Callable(java.util.concurrent.Callable) CompletionService(java.util.concurrent.CompletionService) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Token(zemberek.tokenization.Token) HashMultimap(com.google.common.collect.HashMultimap) Charset(java.nio.charset.Charset) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) ExecutorService(java.util.concurrent.ExecutorService) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Charsets(com.google.common.base.Charsets) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Deasciifier(zemberek.normalization.deasciifier.Deasciifier) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) LanguageIdentifier(zemberek.langid.LanguageIdentifier) SmoothLm(zemberek.lm.compression.SmoothLm) FixedBitVector(zemberek.core.collections.FixedBitVector) ScoredItem(zemberek.core.ScoredItem) RootLexicon(zemberek.morphology.lexicon.RootLexicon) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) BlockTextLoader(zemberek.core.text.BlockTextLoader) ScoredItem(zemberek.core.ScoredItem) 
ArrayList(java.util.ArrayList) SmoothLm(zemberek.lm.compression.SmoothLm) PrintWriter(java.io.PrintWriter)

Example 28 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

In the class ClassificationExperiment, method countTokens:

/**
 * Logs, for each given file, the number of lines and the sizes of the word and label
 * histograms built from its whitespace-separated tokens.
 *
 * <p>A token containing {@code "__label__"} is counted as a label, everything else as a word.
 * A line is additionally logged as a warning when one of its label tokens contains
 * {@code '-'}.
 *
 * @param paths files to inspect, each processed independently
 * @throws IOException if a file cannot be loaded
 */
static void countTokens(Path... paths) throws IOException {
    for (Path path : paths) {
        List<String> lines = TextIO.loadLines(path);
        Histogram<String> wordCounts = new Histogram<>();
        Histogram<String> labelCounts = new Histogram<>();
        for (String line : lines) {
            String[] tokens = line.split("[\\s]+");
            for (String token : tokens) {
                if (!token.contains("__label__")) {
                    wordCounts.add(token);
                    continue;
                }
                // Label token: warn about the whole line when the label contains a dash.
                if (token.contains("-")) {
                    Log.warn(line);
                }
                labelCounts.add(token);
            }
        }
        Log.info("There are %d lines, %d words, %d labels in %s", lines.size(), wordCounts.size(), labelCounts.size(), path);
    }
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram)

Example 29 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

In the class ConllTreebankReader, method dumpStats:

/**
 * Writes frequency statistics of the given dependency sentences to {@code statFile}.
 *
 * <p>The report contains the sentence count followed by the counts of coarse POS tags, POS
 * tags, dependency relations and morphological feature items, each listed in the order
 * returned by {@code Histogram.getSortedList()}. Feature items are obtained by splitting
 * {@code item.feats} on {@code '|'}, trimming and dropping empty pieces. The morphological
 * section is written without a header line, directly after the dependency relations.
 *
 * @param sentences sentences whose items are counted
 * @param statFile file the report is written to
 * @throws IOException if writing to or closing the file fails
 */
public void dumpStats(List<DependencySentence> sentences, File statFile) throws IOException {
    Histogram<CoarsePosTag> coarsePos = new Histogram<>();
    Histogram<PosTag> pos = new Histogram<>();
    Histogram<DependencyRelation> depRelations = new Histogram<>();
    Histogram<String> morphItems = new Histogram<>();
    for (DependencySentence sentence : sentences) {
        for (DependencyItem item : sentence.items) {
            coarsePos.add(item.coarsePosTag);
            pos.add(item.posTag);
            depRelations.add(item.depRelation);
            morphItems.add(Lists.newArrayList(Splitter.on("|").trimResults().omitEmptyStrings().split(item.feats)));
        }
    }
    SimpleTextWriter writer = SimpleTextWriter.keepOpenUTF8Writer(statFile);
    // try/finally guarantees the writer is closed even when a write fails part-way through.
    try {
        writer.writeLine("Sentence count:" + sentences.size());
        writer.writeLine("\nCoarse POS values:\n");
        for (CoarsePosTag coarsePo : coarsePos.getSortedList()) {
            writer.writeLine(coarsePo.getAsConnlValue() + " : " + coarsePos.getCount(coarsePo));
        }
        writer.writeLine("\nPOS values:\n");
        for (PosTag posTag : pos.getSortedList()) {
            writer.writeLine(posTag.getAsConnlValue() + " : " + pos.getCount(posTag));
        }
        writer.writeLine("\nDEP Rel values:\n");
        for (DependencyRelation depRel : depRelations.getSortedList()) {
            writer.writeLine(depRel.getAsConnlString() + " : " + depRelations.getCount(depRel));
        }
        for (String morphItem : morphItems.getSortedList()) {
            writer.writeLine(morphItem + " : " + morphItems.getCount(morphItem));
        }
    } finally {
        writer.close();
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) SimpleTextWriter(zemberek.core.io.SimpleTextWriter)

Example 30 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

In the class Scripts, method checkWeirdChars:

/**
 * Counts suspicious characters in the sentences of every regular file directly under
 * {@code root} and prints each character's hex code and count to standard output.
 *
 * <p>A character is counted when it lies in the range {@code 0x300}-{@code 0x036f} and,
 * independently, when {@code Scripts.undesiredChars} contains it; a character satisfying both
 * conditions is therefore counted twice per occurrence. Each visited file path is printed
 * before it is processed.
 *
 * @param root directory whose direct children are scanned (walk depth is 1)
 * @throws IOException if listing the directory fails
 */
private static void checkWeirdChars(Path root) throws IOException {
    List<Path> files;
    // Files.walk holds an open directory handle; close it as soon as the listing is collected.
    try (java.util.stream.Stream<Path> walk = Files.walk(root, 1)) {
        files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
    }
    Histogram<String> chars = new Histogram<>();
    for (Path file : files) {
        System.out.println(file);
        LinkedHashSet<String> sentences = getSentences(file);
        for (String sentence : sentences) {
            for (int i = 0; i < sentence.length(); i++) {
                char c = sentence.charAt(i);
                if (c >= 0x300 && c <= 0x036f) {
                    chars.add(String.valueOf(c));
                }
                if (Scripts.undesiredChars.contains(c)) {
                    chars.add(String.valueOf(c));
                }
            }
        }
    }
    for (String s : chars.getSortedList()) {
        System.out.println(String.format("%x %d", (int) s.charAt(0), chars.getCount(s)));
    }
}
Also used : Path(java.nio.file.Path) TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) Turkish(zemberek.core.turkish.Turkish) Lists(com.google.common.collect.Lists) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter) TextCleaner(zemberek.normalization.TextCleaner) Morpheme(zemberek.morphology.morphotactics.Morpheme) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) UIntSet(zemberek.core.collections.UIntSet) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) List(java.util.List) Paths(java.nio.file.Paths) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Histogram(zemberek.core.collections.Histogram)

Aggregations

Histogram (zemberek.core.collections.Histogram)39 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)17 Path (java.nio.file.Path)15 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)14 ArrayList (java.util.ArrayList)13 TurkishMorphology (zemberek.morphology.TurkishMorphology)12 Token (zemberek.tokenization.Token)12 Stopwatch (com.google.common.base.Stopwatch)11 PrintWriter (java.io.PrintWriter)11 LinkedHashSet (java.util.LinkedHashSet)11 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)11 IOException (java.io.IOException)10 Files (java.nio.file.Files)10 Paths (java.nio.file.Paths)10 List (java.util.List)10 Collectors (java.util.stream.Collectors)10 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)10 StandardCharsets (java.nio.charset.StandardCharsets)9 HashSet (java.util.HashSet)9 Log (zemberek.core.logging.Log)9