Search in sources:

Example 11 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

The class CategoryPredictionExperiment, method generateSets.

/**
 * Generates fastText-style training and test sets from a categorized web corpus.
 * Documents with rare (count &lt; 20) or empty categories are skipped. Each output line has the
 * form {@code #<docId> __label__<category> <text>}.
 *
 * @param input        path of the web corpus to load.
 * @param train        output path for the training set.
 * @param test         output path for the test set.
 * @param useOnlyTitle if true, only document titles are used as text; titleless docs are skipped.
 * @param useLemmas    if true, each word is replaced with the last lemma of its disambiguated
 *                     analysis.
 * @throws IOException if the corpus cannot be read or the sets cannot be written.
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    // Count category occurrences so that rare categories can be filtered out.
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    // Keep only categories that occur in at least 20 documents.
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() == 0) {
            continue;
        }
        if (useOnlyTitle && document.getTitle().length() == 0) {
            continue;
        }
        // Reject documents of rare categories before doing any tokenization work.
        // (Previously the category check ran after tokenizing the whole document.)
        if (!categoryCounts.contains(category)) {
            continue;
        }
        String label = "__label__" + category.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
        String text = useOnlyTitle ? document.getTitle() : document.getContentAsString();
        List<Token> docTokens = lexer.tokenize(text);
        List<String> reduced = new ArrayList<>(docTokens.size());
        for (Token token : docTokens) {
            if (isIgnoredType(token.getType())) {
                continue;
            }
            reduced.add(token.getText());
        }
        String join = String.join(" ", reduced);
        if (join.trim().isEmpty()) {
            continue;
        }
        if (useLemmas) {
            join = toLemmaString(morphology, join);
        }
        set.add("#" + document.getId() + " " + label + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    // LinkedHashSet de-duplicates lines while keeping insertion order.
    saveSets(train, test, new LinkedHashSet<>(set));
}

/** Returns true for token types that carry no category signal (numbers, punctuation, unknowns). */
private static boolean isIgnoredType(Token.Type type) {
    switch (type) {
        case PercentNumeral:
        case Number:
        case Punctuation:
        case RomanNumeral:
        case Time:
        case UnknownWord:
        case Unknown:
            return true;
        default:
            return false;
    }
}

/**
 * Replaces each word of {@code sentence} with the last lemma of its disambiguated analysis.
 * Words the morphology cannot analyze are kept verbatim; words without lemmas are dropped.
 */
private static String toLemmaString(TurkishMorphology morphology, String sentence) {
    SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
    List<String> res = new ArrayList<>();
    for (SentenceWordAnalysis e : analysis) {
        SingleAnalysis best = e.getBestAnalysis();
        if (best.isUnknown()) {
            res.add(e.getWordAnalysis().getInput());
            continue;
        }
        List<String> lemmas = best.getLemmas();
        if (lemmas.isEmpty()) {
            continue;
        }
        res.add(lemmas.get(lemmas.size() - 1));
    }
    return String.join(" ", res);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 12 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

The class AmbiguityStats, method ambiguousWordStats.

/**
 * Collects and prints statistics about morphologically ambiguous words in a text file.
 * A word is counted as ambiguous when the parser produces more than one analysis for it.
 * Words above the significance cutoff are printed with their counts and percentages.
 *
 * @param filename path of the text file to analyze, one or more words per line.
 * @throws IOException if the file cannot be read.
 */
public void ambiguousWordStats(String filename) throws IOException {
    Histogram<String> ambiguous = new Histogram<>(1000000);
    Splitter tokenSplitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    int tokenCount = 0;
    for (String line : readAll(filename)) {
        for (String word : tokenSplitter.split(line)) {
            WordAnalysis analysis = parser.analyze(word);
            tokenCount++;
            // Periodic progress report.
            if (tokenCount % 50000 == 0) {
                System.out.println("Processed: " + tokenCount);
            }
            if (analysis.analysisCount() > 1) {
                ambiguous.add(word);
            }
        }
    }
    System.out.println("Total: " + tokenCount);
    Stats stats = new Stats(0.002);
    stats.allCounts = (int) ambiguous.totalCount();
    stats.allUniques = ambiguous.size();
    for (String word : ambiguous.getSortedList()) {
        int count = ambiguous.getCount(word);
        if (!stats.overCutoff(count)) {
            continue;
        }
        String percent = percentStr3(count, stats.allCounts);
        stats.significantCounts += count;
        stats.significantUniques++;
        System.out.println(word + " : " + count + "    " + pp(percent));
    }
    stats.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 13 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

The class NormalizationScripts, method generateNormalizationVocabularies.

/**
 * Generates "correct", "incorrect" and "possibly-incorrect" vocabulary files for the text
 * normalization pipeline from word histograms previously extracted from a clean and a noisy
 * corpus. Intermediate suspect lists are written under {@code noisyRoot}; final vocabularies
 * under {@code outRoot}.
 *
 * @param morphology morphology instance used for analyzing candidate words.
 * @param cleanRoot  directory containing "correct"/"incorrect" histograms from clean corpora.
 * @param noisyRoot  directory containing "correct"/"incorrect" histograms from noisy corpora.
 * @param outRoot    output directory; created if missing.
 * @throws IOException if histogram files cannot be read or outputs cannot be written.
 */
static void generateNormalizationVocabularies(TurkishMorphology morphology, Path cleanRoot, Path noisyRoot, Path outRoot) throws IOException {
    Files.createDirectories(outRoot);
    Histogram<String> correctFromNoisy = Histogram.loadFromUtf8File(noisyRoot.resolve("correct"), ' ');
    Log.info("Correct from noisy Loaded");
    Histogram<String> correctFromClean = Histogram.loadFromUtf8File(cleanRoot.resolve("correct"), ' ');
    Log.info("Correct from clean Loaded");
    // Drop words seen only once; they are too unreliable.
    correctFromClean.removeSmaller(2);
    correctFromNoisy.removeSmaller(2);
    Histogram<String> zero = new Histogram<>();
    Histogram<String> zeroWordZeroLemma = new Histogram<>();
    Histogram<String> zeroWordLowLemma = new Histogram<>();
    Histogram<String> lowFreq = new Histogram<>();
    Histogram<String> unusualProper = new Histogram<>();
    Histogram<String> unusualRoots = new Histogram<>();
    Histogram<String> ignore = new Histogram<>();
    double nTotal = correctFromNoisy.totalCount();
    double cTotal = correctFromClean.totalCount();
    for (String s : correctFromNoisy) {
        // Words containing '.' (abbreviations, URLs etc.) are excluded from all sets.
        if (s.contains(".")) {
            ignore.add(s);
            continue;
        }
        int nCount = correctFromNoisy.getCount(s);
        double nFreq = nCount / nTotal;
        WordAnalysis an = morphology.analyze(s);
        if (unusualProper(an)) {
            unusualProper.add(s, nCount);
            continue;
        }
        if (unusualRoot(an)) {
            unusualRoots.add(s, nCount);
            continue;
        }
        if (!correctFromClean.contains(s)) {
            // Word never appears in the clean corpus: suspicious.
            zero.add(s, nCount);
            if (an.analysisCount() > 0) {
                Set<String> allLemmas = new HashSet<>();
                for (SingleAnalysis analysis : an) {
                    allLemmas.addAll(analysis.getLemmas());
                }
                boolean none = true;
                boolean lowLemmaRatio = true;
                // TODO: this is not the best way. try extracting lemma frequencies from correct from clean
                for (String l : allLemmas) {
                    if (correctFromClean.contains(l)) {
                        none = false;
                        double lnf = correctFromNoisy.getCount(l) / nTotal;
                        // Fix: the clean-side lemma frequency must be normalized by the clean
                        // corpus total (cTotal), mirroring the word-level nFreq/cFreq comparison
                        // below. The old code divided by nTotal, skewing the ratio by a constant
                        // factor of nTotal/cTotal.
                        double lcf = correctFromClean.getCount(l) / cTotal;
                        if (lnf / lcf > 10) {
                            lowLemmaRatio = false;
                            break;
                        }
                    }
                }
                if (none) {
                    zeroWordZeroLemma.add(s, nCount);
                }
                if (lowLemmaRatio) {
                    zeroWordLowLemma.add(s, nCount);
                }
            }
            continue;
        }
        double cFreq = correctFromClean.getCount(s) / cTotal;
        // Much more frequent in noisy text than in clean text: possibly incorrect.
        if (nFreq / cFreq > 30) {
            lowFreq.add(s, nCount);
        }
    }
    Log.info("Saving Possibly incorrect words.");
    zero.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero"), " ");
    zeroWordZeroLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-no-lemma"), " ");
    zeroWordLowLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-low-lemma"), " ");
    lowFreq.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-lowfreq"), " ");
    Log.info("Creating vocabularies");
    // ----------- noisy ------------
    Histogram<String> noisy = new Histogram<>(1_000_000);
    Histogram<String> noisyFromCleanCorpora = Histogram.loadFromUtf8File(cleanRoot.resolve("incorrect"), ' ');
    Histogram<String> noisyFromNoisyCorpora = Histogram.loadFromUtf8File(noisyRoot.resolve("incorrect"), ' ');
    Log.info("Incorrect words loaded.");
    noisyFromCleanCorpora.removeSmaller(2);
    noisyFromNoisyCorpora.removeSmaller(2);
    noisy.add(noisyFromCleanCorpora);
    noisy.add(noisyFromNoisyCorpora);
    Histogram<String> possiblyIncorrect = new Histogram<>(1000_000);
    possiblyIncorrect.add(zeroWordZeroLemma);
    for (String lf : lowFreq) {
        if (!possiblyIncorrect.contains(lf)) {
            // Fix: take the count from lowFreq itself. The old code read the count from
            // zeroWordZeroLemma, whose keys are disjoint from lowFreq's (lowFreq words exist in
            // the clean corpus; zero-lemma words by construction do not), so entries were being
            // added with count 0.
            possiblyIncorrect.add(lf, lowFreq.getCount(lf));
        }
    }
    int threshold = 2;
    for (String z : zero) {
        int c = zero.getCount(z);
        if (!possiblyIncorrect.contains(z) && c > threshold) {
            possiblyIncorrect.add(z, c);
        }
    }
    Histogram<String> clean = new Histogram<>(1000_000);
    clean.add(correctFromClean);
    clean.add(correctFromNoisy);
    for (String s : clean) {
        if (s.contains(".")) {
            ignore.add(s);
        }
    }
    clean.removeAll(ignore);
    Histogram<String> asciiDuplicates = getAsciiDuplicates(clean);
    asciiDuplicates.saveSortedByCounts(outRoot.resolve("ascii-dups"), " ");
    possiblyIncorrect.add(asciiDuplicates);
    unusualProper.saveSortedByCounts(outRoot.resolve("unusual-proper"), " ");
    for (String s : unusualProper) {
        if (!possiblyIncorrect.contains(s)) {
            possiblyIncorrect.add(s, unusualProper.getCount(s));
        }
    }
    unusualRoots.saveSortedByCounts(outRoot.resolve("unusual-root"), " ");
    for (String s : unusualRoots) {
        if (!possiblyIncorrect.contains(s)) {
            possiblyIncorrect.add(s, unusualRoots.getCount(s));
        }
    }
    possiblyIncorrect.removeAll(ignore);
    clean.removeAll(asciiDuplicates);
    clean.removeAll(unusualProper);
    clean.removeAll(unusualRoots);
    clean.removeAll(possiblyIncorrect);
    // Sanity checks: the three output vocabularies should be pairwise disjoint.
    Set<String> intersectionOfKeys = noisy.getIntersectionOfKeys(clean);
    int sharedKeyCount = intersectionOfKeys.size();
    if (sharedKeyCount > 0) {
        Log.warn("Incorrect and correct sets share %d keys", sharedKeyCount);
    }
    sharedKeyCount = noisy.getIntersectionOfKeys(possiblyIncorrect).size();
    if (sharedKeyCount > 0) {
        Log.warn("Incorrect and possibly incorrect sets share %d keys", sharedKeyCount);
    }
    sharedKeyCount = clean.getIntersectionOfKeys(possiblyIncorrect).size();
    if (sharedKeyCount > 0) {
        Log.warn("Correct and possibly incorrect sets share %d keys", sharedKeyCount);
    }
    Log.info("Saving sets.");
    clean.saveSortedByCounts(outRoot.resolve("correct"), " ");
    Log.info("Correct words saved.");
    noisy.saveSortedByCounts(outRoot.resolve("incorrect"), " ");
    Log.info("Incorrect words saved.");
    possiblyIncorrect.saveSortedByCounts(outRoot.resolve("possibly-incorrect"), " ");
    Log.info("Possibly Incorrect words saved.");
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 14 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

The class DataConverter, method extract.

/**
 * Converts an old-format disambiguation data set to {@link SentenceAnalysis} objects and saves
 * them. Only sentences where every token can be matched to a morphological analysis are kept;
 * unmatched word/parse pairs are counted and dumped to "parse-fails.txt".
 *
 * @param dataPath path of the old-format data set.
 * @param output   path the converted sentences are written to.
 * @throws IOException if the data set cannot be read or the outputs cannot be written.
 */
private static void extract(Path dataPath, Path output) throws IOException {
    DataSet set = com.google.common.io.Files.asCharSource(dataPath.toFile(), Charsets.UTF_8).readLines(new DataSetLoader());
    TurkishMorphology morphology = TurkishMorphology.create(RootLexicon.builder().addTextDictionaryResources("tr/master-dictionary.dict", "tr/non-tdk.dict", "tr/proper.dict", "tr/proper-from-corpus.dict", "tr/abbreviations.dict", "tr/person-names.dict").build());
    List<SentenceAnalysis> result = new ArrayList<>();
    Histogram<String> parseFails = new Histogram<>();
    for (SentenceData sentenceData : set) {
        List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
        // Skip sentences whose token count does not line up with the gold parses.
        if (tokens.size() == 0 || tokens.size() != sentenceData.correctParse.size()) {
            continue;
        }
        List<SentenceWordAnalysis> correctList = new ArrayList<>();
        for (int i = 0; i < tokens.size(); i++) {
            String s = tokens.get(i);
            String p = normalizeParse(sentenceData.correctParse.get(i));
            WordAnalysis a = morphology.analyze(s);
            if (!a.isCorrect()) {
                break;
            }
            SingleAnalysis best = findMatchingAnalysis(a, s, p);
            if (best == null) {
                parseFails.add(s + " " + p);
            } else {
                correctList.add(new SentenceWordAnalysis(best, a));
            }
        }
        // Accept the sentence only if every token was matched.
        if (correctList.size() == tokens.size()) {
            result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
        }
    }
    Scripts.saveUnambiguous(result, output);
    parseFails.removeSmaller(3);
    parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
    System.out.format("Full Sentence Match  = %d in %d%n", result.size(), set.sentences.size());
}

/** Normalizes an old-format parse string so it can be compared with {@code convert} output. */
private static String normalizeParse(String p) {
    p = p.replaceAll("PCNom", "PCNOM");
    p = p.replaceAll("Pnon|Nom", "");
    p = p.replaceAll("\\+Pos\\+", "+");
    p = p.replaceAll("\\+Pos\\^DB", "^DB");
    p = p.replaceAll("[+]+", "+");
    p = p.replaceAll("[+]$", "");
    p = p.replaceAll("[+]\\^DB", "^DB");
    p = p.replaceAll("[.]", "");
    p = p.toLowerCase(Turkish.LOCALE);
    p = p.replaceAll("adverb", "adv");
    p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
    // NOTE(review): this runs after toLowerCase, so "+Unable" (capital U) can never match.
    // Kept as-is to preserve behavior; confirm whether it should be "\\+unable" or run earlier.
    p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
    if (lookup.containsKey(p)) {
        p = lookup.get(p);
    }
    return p;
}

/**
 * Returns the analysis in {@code a} whose converted form equals {@code p}, or null if none
 * matches. For capitalized words with a non-proper noun parse, a proper-noun variant of the
 * parse is also tried.
 */
private static SingleAnalysis findMatchingAnalysis(WordAnalysis a, String word, String p) {
    for (SingleAnalysis analysis : a) {
        if (convert(analysis).equals(p)) {
            return analysis;
        }
    }
    if (Character.isUpperCase(word.charAt(0)) && (p.contains("+noun") && !p.contains("prop"))) {
        String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
        for (SingleAnalysis analysis : a) {
            if (convert(analysis).equals(pp)) {
                return analysis;
            }
        }
    }
    return null;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) DataSet(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSet) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceData(zemberek.morphology.old_ambiguity.AbstractDisambiguator.SentenceData) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) DataSetLoader(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSetLoader) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 15 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

The class WordHistogram, method generateHistograms.

/**
 * Builds root and surface-word frequency histograms from a list of paragraphs and writes them,
 * sorted by count and by key, into {@code outRoot}. Numerals, punctuation, unknown words and
 * digit-containing runtime analyses are excluded. Proper nouns are capitalized; all other words
 * are lowercased with the Turkish locale.
 *
 * @param paragraphs input paragraphs.
 * @param outRoot    output directory; created if missing.
 * @throws IOException if output files cannot be written.
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Histogram<String> rootCounts = new Histogram<>(1000_000);
    Histogram<String> wordCounts = new Histogram<>(1000_000);
    int paragraphCount = 0;
    int sentenceCount = 0;
    int tokenCount = 0;
    for (String paragraph : paragraphs) {
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        sentenceCount += sentences.size();
        for (String sentence : sentences) {
            tokenCount += TurkishTokenizer.DEFAULT.tokenize(sentence).size();
            for (SentenceWordAnalysis swa : morphology.analyzeAndDisambiguate(sentence)) {
                SingleAnalysis best = swa.getBestAnalysis();
                String input = swa.getWordAnalysis().getInput();
                // Skip tokens that should not contribute to the histograms.
                boolean skip =
                    best.getPos() == PrimaryPos.Numeral
                        || best.getPos() == PrimaryPos.Punctuation
                        || best.isUnknown()
                        || (best.isRuntime() && !Strings.containsNone(input, "01234567890"))
                        || best.getLemmas().size() == 0;
                if (skip) {
                    continue;
                }
                rootCounts.add(best.getDictionaryItem().lemma);
                boolean proper = best.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun;
                wordCounts.add(proper ? Turkish.capitalize(input) : input.toLowerCase(Turkish.LOCALE));
            }
        }
        paragraphCount++;
        if (paragraphCount % 1000 == 0) {
            System.out.println(paragraphCount + " of " + paragraphs.size());
        }
    }
    System.out.println("tokenCounter = " + tokenCount);
    System.out.println("sentenceCounter = " + sentenceCount);
    Files.createDirectories(outRoot);
    rootCounts.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    rootCounts.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    wordCounts.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // A second, pruned set with only words seen at least 10 times.
    wordCounts.removeSmaller(10);
    wordCounts.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Aggregations

Histogram (zemberek.core.collections.Histogram)39 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)17 Path (java.nio.file.Path)15 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)14 ArrayList (java.util.ArrayList)13 TurkishMorphology (zemberek.morphology.TurkishMorphology)12 Token (zemberek.tokenization.Token)12 Stopwatch (com.google.common.base.Stopwatch)11 PrintWriter (java.io.PrintWriter)11 LinkedHashSet (java.util.LinkedHashSet)11 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)11 IOException (java.io.IOException)10 Files (java.nio.file.Files)10 Paths (java.nio.file.Paths)10 List (java.util.List)10 Collectors (java.util.stream.Collectors)10 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)10 StandardCharsets (java.nio.charset.StandardCharsets)9 HashSet (java.util.HashSet)9 Log (zemberek.core.logging.Log)9