Search in sources :

Example 11 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The following example is taken from the class AutomaticLabelingExperiment, method processContent.

/**
 * Tokenizes {@code content}, drops numeric/punctuation/unknown tokens, and returns the
 * remaining words joined by single spaces, with apostrophes removed and the result
 * lower-cased using the Turkish locale. When {@code useRoots} is true, each word is
 * replaced by the last lemma of its disambiguated best analysis; words the analyzer
 * does not recognize are kept in their surface form.
 */
public String processContent(TurkishMorphology analyzer, String content, boolean useRoots) {
    List<Token> tokens = lexer.tokenize(content);
    List<String> words = new ArrayList<>(tokens.size());
    for (Token token : tokens) {
        if (!shouldSkip(token.getType())) {
            words.add(token.getText());
        }
    }
    String joined = String.join(" ", words);
    if (useRoots) {
        SentenceAnalysis disambiguated = analyzer.analyzeAndDisambiguate(joined);
        List<String> lemmatized = new ArrayList<>();
        for (SentenceWordAnalysis swa : disambiguated) {
            SingleAnalysis best = swa.getBestAnalysis();
            if (best.isUnknown()) {
                // No analysis available; fall back to the surface form.
                lemmatized.add(swa.getWordAnalysis().getInput());
            } else {
                List<String> lemmas = best.getLemmas();
                if (!lemmas.isEmpty()) {
                    // The last lemma is the longest/most specific one.
                    lemmatized.add(lemmas.get(lemmas.size() - 1));
                }
            }
        }
        joined = String.join(" ", lemmatized);
    }
    return joined.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}

/** Returns true for token types that carry no useful lexical content for labeling. */
private static boolean shouldSkip(Token.Type type) {
    switch (type) {
        case PercentNumeral:
        case Number:
        case Punctuation:
        case RomanNumeral:
        case Time:
        case UnknownWord:
        case Unknown:
            return true;
        default:
            return false;
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 12 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The following example is taken from the class _MorphologicalAmbiguityResolverExperiment, method extractData.

/**
 * Scans the directory {@code p} (depth 1) for {@code .corpus} files, collects sentences
 * whose words all have at most {@code maxAnalysisCount} analyses, and writes the
 * ambiguous-analysis dump plus the failed-word reports under {@code outRoot}.
 * Collection stops early once {@code resultLimit} sentences are gathered (0 = no limit).
 */
public void extractData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit) throws IOException {
    List<Path> corpusFiles = Files.walk(p, 1)
        .filter(f -> f.toFile().isFile() && f.toFile().getName().endsWith(".corpus"))
        .collect(Collectors.toList());
    LinkedHashSet<SingleAnalysisSentence> sentences = new LinkedHashSet<>();
    int processed = 0;
    for (Path corpusFile : corpusFiles) {
        sentences.addAll(collect(corpusFile, maxAnalysisCount));
        processed++;
        Log.info("%d of %d", processed, corpusFiles.size());
        // Stop early when a positive result limit has been reached.
        if (resultLimit > 0 && sentences.size() > resultLimit) {
            break;
        }
    }
    String name = p.toFile().getName();
    Path out = outRoot.resolve(name + "-ambigious.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (SingleAnalysisSentence sentence : sentences) {
            pw.println(sentence.sentence);
            // For every token, print each surviving analysis on its own pair of lines.
            for (Single word : sentence.tokens) {
                for (SingleAnalysis analysis : word.res) {
                    pw.println(word.input);
                    pw.println(analysis.formatLong());
                }
            }
            pw.println();
        }
    }
    // saving failed words.
    failedWords.saveSortedByKeys(outRoot.resolve(name + "-failed.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // saving failed words by frequency.
    failedWords.saveSortedByCounts(outRoot.resolve(name + "-failed.freq.txt"), " ");
}
Also used : Path(java.nio.file.Path) Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Objects(java.util.Objects) List(java.util.List) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) PrintWriter(java.io.PrintWriter)

Example 13 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The following example is taken from the class _MorphologicalAmbiguityResolverExperiment, method collect.

/**
 * Reads sentences from {@code p} and returns those in which every token has between
 * 1 and {@code maxAnalysisCount} morphological analyses (after filtering out
 * ProperNoun readings of lower-case words). Words with no analysis are recorded in
 * {@code failedWords} unless they contain digits, '-' or '.'.
 *
 * @param p                corpus file to read sentences from
 * @param maxAnalysisCount maximum number of analyses a token may have
 * @return sentences whose tokens all passed the analysis-count filter
 * @throws IOException if the sentence source cannot be read
 */
private List<SingleAnalysisSentence> collect(Path p, int maxAnalysisCount) throws IOException {
    List<String> sentences = getSentences(p);
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    int tokenCount = 0;
    int sentenceCount = 0;
    List<SingleAnalysisSentence> result = new ArrayList<>();
    for (String sentence : sentences) {
        // Normalize whitespace, soft hyphens and ellipsis characters before tokenization.
        sentence = sentence.replaceAll("\\s+|\\u00a0", " ");
        sentence = sentence.replaceAll("[\\u00ad]", "");
        sentence = sentence.replaceAll("[…]", "...");
        List<Single> singleAnalysisWords = new ArrayList<>();
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        boolean failed = false;
        int i = 0;
        for (Token token : tokens) {
            tokenCount++;
            String rawWord = token.getText();
            if (rawWord.isEmpty()) {
                // Defensive: skip degenerate tokens so charAt(0) below cannot throw.
                continue;
            }
            // Normalize case: capitalize words that start upper-case, lower-case the rest.
            String word = Character.isUpperCase(rawWord.charAt(0))
                ? Turkish.capitalize(rawWord)
                : rawWord.toLowerCase(Turkish.LOCALE);
            // Memoize analyses; re-analyzing a repeated surface form is expensive.
            WordAnalysis results = cache.computeIfAbsent(word, analyzer::analyze);
            if (results.analysisCount() == 0) {
                // Record unanalyzable words, unless they look numeric or abbreviated.
                if (Strings.containsNone(word, "0123456789-.")) {
                    failedWords.add(word);
                }
            }
            // Reject the whole sentence if any token has no analysis or too many.
            if (results.analysisCount() < 1 || results.analysisCount() > maxAnalysisCount) {
                failed = true;
                break;
            }
            // Drop ProperNoun readings for words written entirely in lower case.
            List<SingleAnalysis> filtered = results.stream()
                .filter(s -> !(s.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun
                    && Character.isLowerCase(rawWord.charAt(0))))
                .collect(Collectors.toList());
            if (filtered.isEmpty()) {
                failed = true;
                break;
            }
            singleAnalysisWords.add(new Single(word, i, results.copyFor(filtered)));
            i++;
        }
        if (!failed) {
            result.add(new SingleAnalysisSentence(sentence, singleAnalysisWords));
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d sentences %d tokens analyzed. %d found", sentenceCount, tokenCount, result.size());
        }
    }
    return result;
}
Also used : Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) Token(zemberek.tokenization.Token) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Objects(java.util.Objects) List(java.util.List) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token)

Example 14 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The following example is taken from the class CategoryPredictionExperiment, method generateSets.

/**
 * Builds train/test sets for category prediction from a web corpus.
 * Documents are turned into lines of the form "#<id> __label__<category> <text>";
 * categories seen fewer than 20 times are dropped. Text is either the title or the
 * full content, optionally reduced to lemmas via morphological disambiguation.
 *
 * @param input        path of the serialized WebCorpus documents
 * @param train        output path for the training set
 * @param test         output path for the test set
 * @param useOnlyTitle if true, only document titles are used as text
 * @param useLemmas    if true, words are replaced by the last lemma of their best analysis
 * @throws IOException if the corpus cannot be read or the sets cannot be written
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    // Count category frequencies so rare labels can be filtered out below.
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    // Keep only categories with at least 20 documents.
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        // Skip documents without a category, or without a title when only titles are used.
        if (document.getCategory().length() == 0) {
            continue;
        }
        if (useOnlyTitle && document.getTitle().length() == 0) {
            continue;
        }
        String content = document.getContentAsString();
        String title = document.getTitle();
        List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
        List<String> reduced = new ArrayList<>(docTokens.size());
        String category = document.getCategory();
        if (categoryCounts.contains(category)) {
            // fastText-style label: "__label__" prefix, spaces replaced, Turkish lower-case.
            category = "__label__" + document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
        } else {
            continue;
        }
        for (Token token : docTokens) {
            // Drop numeric, punctuation, time and unknown tokens; keep only word-like ones.
            if (token.getType() == Token.Type.PercentNumeral || token.getType() == Token.Type.Number || token.getType() == Token.Type.Punctuation || token.getType() == Token.Type.RomanNumeral || token.getType() == Token.Type.Time || token.getType() == Token.Type.UnknownWord || token.getType() == Token.Type.Unknown) {
                continue;
            }
            String tokenStr = token.getText();
            reduced.add(tokenStr);
        }
        String join = String.join(" ", reduced);
        if (join.trim().isEmpty()) {
            continue;
        }
        if (useLemmas) {
            // Replace each word with the last (most specific) lemma of its best analysis;
            // unknown words keep their surface form.
            SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
            List<String> res = new ArrayList<>();
            for (SentenceWordAnalysis e : analysis) {
                SingleAnalysis best = e.getBestAnalysis();
                if (best.isUnknown()) {
                    res.add(e.getWordAnalysis().getInput());
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.size() == 0) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }
        set.add("#" + document.getId() + " " + category + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    saveSets(train, test, new LinkedHashSet<>(set));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 15 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The following example is taken from the class NormalizationScripts, method generateNormalizationVocabularies.

/**
 * Builds "correct", "incorrect" and "possibly-incorrect" word vocabularies for text
 * normalization, by comparing word frequencies between a clean and a noisy corpus.
 * Inputs are histogram files named "correct"/"incorrect" under cleanRoot and noisyRoot;
 * the resulting vocabularies are written under outRoot (diagnostic lists are also
 * written under noisyRoot).
 *
 * @param morphology analyzer used to detect unusual proper nouns and roots
 * @param cleanRoot  directory containing histograms extracted from clean corpora
 * @param noisyRoot  directory containing histograms extracted from noisy corpora
 * @param outRoot    output directory for the generated vocabularies
 * @throws IOException if histogram files cannot be read or outputs cannot be written
 */
static void generateNormalizationVocabularies(TurkishMorphology morphology, Path cleanRoot, Path noisyRoot, Path outRoot) throws IOException {
    Files.createDirectories(outRoot);
    Histogram<String> correctFromNoisy = Histogram.loadFromUtf8File(noisyRoot.resolve("correct"), ' ');
    Log.info("Correct from noisy Loaded");
    Histogram<String> correctFromClean = Histogram.loadFromUtf8File(cleanRoot.resolve("correct"), ' ');
    Log.info("Correct from clean Loaded");
    // Drop singletons from both histograms.
    correctFromClean.removeSmaller(2);
    correctFromNoisy.removeSmaller(2);
    // Buckets for words suspected to be incorrect, keyed by the evidence that flagged them.
    Histogram<String> zero = new Histogram<>();
    Histogram<String> zeroWordZeroLemma = new Histogram<>();
    Histogram<String> zeroWordLowLemma = new Histogram<>();
    Histogram<String> lowFreq = new Histogram<>();
    Histogram<String> lowFreqLowLemmaFreq = new Histogram<>();
    Histogram<String> unusualProper = new Histogram<>();
    Histogram<String> unusualRoots = new Histogram<>();
    Histogram<String> ignore = new Histogram<>();
    double nTotal = correctFromNoisy.totalCount();
    double cTotal = correctFromClean.totalCount();
    // Classify every "correct" word seen in the noisy corpus.
    for (String s : correctFromNoisy) {
        // Words containing '.' (abbreviations, URLs) are excluded from all vocabularies.
        if (s.contains(".")) {
            ignore.add(s);
            continue;
        }
        int nCount = correctFromNoisy.getCount(s);
        double nFreq = nCount / nTotal;
        WordAnalysis an = morphology.analyze(s);
        if (unusualProper(an)) {
            unusualProper.add(s, correctFromNoisy.getCount(s));
            continue;
        }
        if (unusualRoot(an)) {
            unusualRoots.add(s, correctFromNoisy.getCount(s));
            continue;
        }
        // Word never appears in the clean corpus: suspicious.
        if (!correctFromClean.contains(s)) {
            zero.add(s, nCount);
            if (an.analysisCount() > 0) {
                // Collect lemmas from all analyses and check them against the clean corpus.
                Set<String> allLemmas = new HashSet<>();
                for (SingleAnalysis analysis : an) {
                    allLemmas.addAll(analysis.getLemmas());
                }
                boolean none = true;
                boolean lowLemmaRatio = true;
                // TODO: this is not the best way. try extracting lemma frequencies from correct from clean
                for (String l : allLemmas) {
                    if (correctFromClean.contains(l)) {
                        none = false;
                        double lnf = correctFromNoisy.getCount(l) / nTotal;
                        // NOTE(review): lcf divides a clean-corpus count by nTotal; given the
                        // naming (nTotal=noisy total, cTotal=clean total) this looks like it
                        // should be cTotal, matching cFreq below — confirm before changing,
                        // as it alters the lnf/lcf ratio threshold behavior.
                        double lcf = correctFromClean.getCount(l) / nTotal;
                        if (lnf / lcf > 10) {
                            lowLemmaRatio = false;
                            break;
                        }
                    }
                }
                if (none) {
                    zeroWordZeroLemma.add(s, nCount);
                }
                if (lowLemmaRatio) {
                    zeroWordLowLemma.add(s, nCount);
                }
            }
            continue;
        }
        // Word exists in both corpora but is disproportionately frequent in the noisy one.
        double cFreq = correctFromClean.getCount(s) / cTotal;
        if (nFreq / cFreq > 30) {
            lowFreq.add(s, nCount);
        }
    }
    Log.info("Saving Possibly incorrect words.");
    zero.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero"), " ");
    zeroWordZeroLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-no-lemma"), " ");
    zeroWordLowLemma.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-zero-low-lemma"), " ");
    lowFreq.saveSortedByCounts(noisyRoot.resolve("possibly-incorrect-lowfreq"), " ");
    Log.info("Creating vocabularies");
    // ----------- noisy ------------
    Histogram<String> noisy = new Histogram<>(1_000_000);
    Histogram<String> noisyFromCleanCorpora = Histogram.loadFromUtf8File(cleanRoot.resolve("incorrect"), ' ');
    Histogram<String> noisyFromNoisyCorpora = Histogram.loadFromUtf8File(noisyRoot.resolve("incorrect"), ' ');
    Log.info("Incorrect words loaded.");
    noisyFromCleanCorpora.removeSmaller(2);
    noisyFromNoisyCorpora.removeSmaller(2);
    noisy.add(noisyFromCleanCorpora);
    noisy.add(noisyFromNoisyCorpora);
    // Merge the suspicion buckets into a single "possibly incorrect" vocabulary.
    Histogram<String> possiblyIncorrect = new Histogram<>(1000_000);
    possiblyIncorrect.add(zeroWordZeroLemma);
    for (String lf : lowFreq) {
        if (!possiblyIncorrect.contains(lf)) {
            // NOTE(review): this inserts lowFreq keys with their count from
            // zeroWordZeroLemma (often 0), not from lowFreq — verify this is intended.
            possiblyIncorrect.add(lf, zeroWordZeroLemma.getCount(lf));
        }
    }
    int threshold = 2;
    for (String z : zero) {
        int c = zero.getCount(z);
        if (!possiblyIncorrect.contains(z) && c > threshold) {
            possiblyIncorrect.add(z, c);
        }
    }
    // The "clean" vocabulary is the union of correct words from both corpora,
    // minus ignored words and everything flagged as suspicious.
    Histogram<String> clean = new Histogram<>(1000_000);
    clean.add(correctFromClean);
    clean.add(correctFromNoisy);
    for (String s : clean) {
        if (s.contains(".")) {
            ignore.add(s);
        }
    }
    clean.removeAll(ignore);
    Histogram<String> asciiDuplicates = getAsciiDuplicates(clean);
    asciiDuplicates.saveSortedByCounts(outRoot.resolve("ascii-dups"), " ");
    possiblyIncorrect.add(asciiDuplicates);
    unusualProper.saveSortedByCounts(outRoot.resolve("unusual-proper"), " ");
    for (String s : unusualProper) {
        if (!possiblyIncorrect.contains(s)) {
            possiblyIncorrect.add(s, unusualProper.getCount(s));
        }
    }
    unusualRoots.saveSortedByCounts(outRoot.resolve("unusual-root"), " ");
    for (String s : unusualRoots) {
        if (!possiblyIncorrect.contains(s)) {
            possiblyIncorrect.add(s, unusualRoots.getCount(s));
        }
    }
    possiblyIncorrect.removeAll(ignore);
    clean.removeAll(asciiDuplicates);
    clean.removeAll(unusualProper);
    clean.removeAll(unusualRoots);
    clean.removeAll(possiblyIncorrect);
    // Sanity checks: the three vocabularies should be pairwise disjoint.
    Set<String> intersectionOfKeys = noisy.getIntersectionOfKeys(clean);
    int sharedKeyCount = intersectionOfKeys.size();
    if (sharedKeyCount > 0) {
        Log.warn("Incorrect and correct sets share %d keys", sharedKeyCount);
    }
    sharedKeyCount = noisy.getIntersectionOfKeys(possiblyIncorrect).size();
    if (sharedKeyCount > 0) {
        Log.warn("Incorrect and possibly incorrect sets share %d keys", sharedKeyCount);
    }
    sharedKeyCount = clean.getIntersectionOfKeys(possiblyIncorrect).size();
    if (sharedKeyCount > 0) {
        Log.warn("Correct and possibly incorrect sets share %d keys", sharedKeyCount);
    }
    Log.info("Saving sets.");
    clean.saveSortedByCounts(outRoot.resolve("correct"), " ");
    Log.info("Correct words saved.");
    noisy.saveSortedByCounts(outRoot.resolve("incorrect"), " ");
    Log.info("Incorrect words saved.");
    possiblyIncorrect.saveSortedByCounts(outRoot.resolve("possibly-incorrect"), " ");
    Log.info("Possibly Incorrect words saved.");
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)55 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)38 ArrayList (java.util.ArrayList)25 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)23 TurkishMorphology (zemberek.morphology.TurkishMorphology)21 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)18 Test (org.junit.Test)15 LinkedHashSet (java.util.LinkedHashSet)13 PrintWriter (java.io.PrintWriter)10 Path (java.nio.file.Path)10 Histogram (zemberek.core.collections.Histogram)10 Token (zemberek.tokenization.Token)7 IOException (java.io.IOException)6 Ignore (org.junit.Ignore)6 Log (zemberek.core.logging.Log)6 HashSet (java.util.HashSet)5 List (java.util.List)5 Collectors (java.util.stream.Collectors)5 Paths (java.nio.file.Paths)4 Files (java.nio.file.Files)3