Search in sources:

Example 6 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

In the class AutomaticLabelingExperiment, the method processContent.

/**
 * Turns raw document text into a single lower-cased, space separated string.
 *
 * <p>The text is tokenized with this object's {@code lexer}; numeric, punctuation, time and
 * unknown token types are dropped. When {@code useRoots} is set, the remaining text is run
 * through morphological disambiguation and every word is replaced by the last lemma of its
 * best analysis (unknown words are kept as they were given). Apostrophes are removed from
 * the final string.
 *
 * @param analyzer morphology instance used only when {@code useRoots} is {@code true}
 * @param content  text to normalize
 * @param useRoots whether words should be replaced by lemmas
 * @return the normalized text, lower-cased with the Turkish locale
 */
public String processContent(TurkishMorphology analyzer, String content, boolean useRoots) {
    List<Token> tokens = lexer.tokenize(content);
    List<String> kept = new ArrayList<>(tokens.size());
    for (Token token : tokens) {
        // Token kinds that carry no lexical content for this experiment.
        boolean ignorable = token.getType() == Token.Type.PercentNumeral
            || token.getType() == Token.Type.Number
            || token.getType() == Token.Type.Punctuation
            || token.getType() == Token.Type.RomanNumeral
            || token.getType() == Token.Type.Time
            || token.getType() == Token.Type.UnknownWord
            || token.getType() == Token.Type.Unknown;
        if (!ignorable) {
            kept.add(token.getText());
        }
    }
    String text = String.join(" ", kept);
    if (!useRoots) {
        return text.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
    }

    List<String> roots = new ArrayList<>();
    for (SentenceWordAnalysis wordResult : analyzer.analyzeAndDisambiguate(text)) {
        SingleAnalysis best = wordResult.getBestAnalysis();
        if (best.isUnknown()) {
            // No analysis available: fall back to the surface form.
            roots.add(wordResult.getWordAnalysis().getInput());
        } else {
            List<String> lemmas = best.getLemmas();
            if (lemmas.size() != 0) {
                roots.add(lemmas.get(lemmas.size() - 1));
            }
        }
    }
    return String.join(" ", roots).replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 7 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

In the class CategoryPredictionExperiment, the method generateSets.

/**
 * Loads a categorized web corpus and writes labelled train/test sets from it.
 *
 * <p>Categories with a count smaller than 20 are removed from the histogram and documents
 * belonging to them are skipped. Every remaining document becomes one line of the form
 * {@code #<id> __label__<category> <text>}, where the text is the title or the content with
 * numeric, punctuation, time and unknown tokens removed, optionally reduced to lemmas,
 * stripped of apostrophes and lower-cased with the Turkish locale. Duplicate lines are
 * removed (keeping first-seen order) before the lines are handed to {@code saveSets}.
 *
 * @param input        corpus file to load
 * @param train        destination passed to {@code saveSets} for the training set
 * @param test         destination passed to {@code saveSets} for the test set
 * @param useOnlyTitle use the document title instead of its content; documents with an
 *                     empty title are skipped in that case
 * @param useLemmas    replace each word with the last lemma of its best analysis
 * @throws IOException if loading the corpus or saving the sets fails
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        String rawCategory = document.getCategory();
        if (rawCategory.length() == 0) {
            continue;
        }
        if (useOnlyTitle && document.getTitle().length() == 0) {
            continue;
        }
        // Reject documents of removed categories before doing any tokenization work.
        if (!categoryCounts.contains(rawCategory)) {
            continue;
        }
        String category = "__label__" + rawCategory.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);

        // Only fetch and tokenize the text that is actually going to be used.
        String text = useOnlyTitle ? document.getTitle() : document.getContentAsString();
        List<Token> docTokens = lexer.tokenize(text);
        List<String> reduced = new ArrayList<>(docTokens.size());
        for (Token token : docTokens) {
            if (token.getType() == Token.Type.PercentNumeral || token.getType() == Token.Type.Number || token.getType() == Token.Type.Punctuation || token.getType() == Token.Type.RomanNumeral || token.getType() == Token.Type.Time || token.getType() == Token.Type.UnknownWord || token.getType() == Token.Type.Unknown) {
                continue;
            }
            reduced.add(token.getText());
        }
        String join = String.join(" ", reduced);
        if (join.trim().isEmpty()) {
            continue;
        }
        if (useLemmas) {
            SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
            List<String> res = new ArrayList<>();
            for (SentenceWordAnalysis e : analysis) {
                SingleAnalysis best = e.getBestAnalysis();
                if (best.isUnknown()) {
                    // Keep the surface form when the word could not be analyzed.
                    res.add(e.getWordAnalysis().getInput());
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.isEmpty()) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }
        set.add("#" + document.getId() + " " + category + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        // c counts only documents that produced a line; it is logged after the increment.
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    // LinkedHashSet drops duplicate lines while preserving insertion order.
    saveSets(train, test, new LinkedHashSet<>(set));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 8 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

In the class DataConverter, the method extract.

/**
 * Converts a hand-annotated data set into this project's analysis format.
 *
 * <p>Each sentence is split on single spaces. For every token the reference parse string is
 * normalized by a fixed sequence of textual substitutions, optionally remapped through the
 * {@code lookup} table defined elsewhere in this class, and then compared against
 * {@code convert(analysis)} for each analysis the morphology produces. A sentence is kept
 * only when every token found a matching analysis. Kept sentences are written through
 * {@code Scripts.saveUnambiguous}; unmatched token/parse pairs occurring at least three
 * times are written to {@code parse-fails.txt} in the working directory.
 *
 * @param dataPath annotated input file, read as UTF-8
 * @param output   destination for the matched sentences
 * @throws IOException if reading the input or writing any output fails
 */
private static void extract(Path dataPath, Path output) throws IOException {
    DataSet set = com.google.common.io.Files.asCharSource(dataPath.toFile(), Charsets.UTF_8).readLines(new DataSetLoader());
    TurkishMorphology morphology = TurkishMorphology.create(RootLexicon.builder().addTextDictionaryResources("tr/master-dictionary.dict", "tr/non-tdk.dict", "tr/proper.dict", "tr/proper-from-corpus.dict", "tr/abbreviations.dict", "tr/person-names.dict").build());
    List<SentenceAnalysis> result = new ArrayList<>();
    Histogram<String> parseFails = new Histogram<>();
    for (SentenceData sentenceData : set) {
        List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
        // Token and parse lists must line up one-to-one, otherwise the sentence is unusable.
        if (tokens.isEmpty() || tokens.size() != sentenceData.correctParse.size()) {
            continue;
        }
        List<SentenceWordAnalysis> correctList = new ArrayList<>();
        for (int i = 0; i < tokens.size(); i++) {
            String s = tokens.get(i);
            String p = sentenceData.correctParse.get(i);
            // The substitutions below are order dependent; do not reorder them.
            p = p.replaceAll("PCNom", "PCNOM");
            p = p.replaceAll("Pnon|Nom", "");
            p = p.replaceAll("\\+Pos\\+", "+");
            p = p.replaceAll("\\+Pos\\^DB", "^DB");
            p = p.replaceAll("[+]+", "+");
            p = p.replaceAll("[+]$", "");
            p = p.replaceAll("[+]\\^DB", "^DB");
            p = p.replaceAll("[.]", "");
            p = p.toLowerCase(Turkish.LOCALE);
            p = p.replaceAll("adverb", "adv");
            p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
            // NOTE(review): p was lower-cased above, so this capitalised pattern looks like it
            // can never match. Left untouched because the intended target form is not visible
            // from here - confirm and either lower-case the pattern/replacement or remove it.
            p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
            if (lookup.containsKey(p)) {
                p = lookup.get(p);
            }
            WordAnalysis a = morphology.analyze(s);
            if (!a.isCorrect()) {
                // Abandons the sentence: correctList stays shorter than tokens.
                break;
            }
            SingleAnalysis best = null;
            for (SingleAnalysis analysis : a) {
                String of = convert(analysis);
                if (of.equals(p)) {
                    best = analysis;
                    break;
                }
            }
            // Second chance for capitalised words: retry with the first "+noun" tagged as proper.
            // The emptiness check protects charAt(0) against empty tokens, which
            // Splitter.on(" ") yields for consecutive spaces.
            if (best == null && !s.isEmpty() && Character.isUpperCase(s.charAt(0)) && p.contains("+noun") && !p.contains("prop")) {
                String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
                for (SingleAnalysis analysis : a) {
                    String of = convert(analysis);
                    if (of.equals(pp)) {
                        best = analysis;
                        break;
                    }
                }
            }
            if (best == null) {
                parseFails.add(s + " " + p);
            } else {
                correctList.add(new SentenceWordAnalysis(best, a));
            }
        }
        if (correctList.size() == tokens.size()) {
            result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
        }
    }
    Scripts.saveUnambiguous(result, output);
    parseFails.removeSmaller(3);
    parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
    System.out.format("Full Sentence Match  = %d in %d%n", result.size(), set.sentences.size());
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) DataSet(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSet) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceData(zemberek.morphology.old_ambiguity.AbstractDisambiguator.SentenceData) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) DataSetLoader(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSetLoader) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 9 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

In the class WordHistogram, the method generateHistograms.

/**
 * Counts dictionary lemmas and surface words over the given paragraphs and writes the
 * resulting histograms under {@code outRoot}.
 *
 * <p>Each paragraph is split into sentences and each sentence is disambiguated. A word is
 * skipped when its best analysis is a numeral or punctuation, is unknown, is a runtime
 * analysis whose input contains a digit, or has no lemmas. For the remaining words the
 * dictionary item's lemma is counted in the root histogram and the input word is counted in
 * the word histogram, capitalized for proper nouns and lower-cased otherwise.
 *
 * <p>Six files are written: {@code roots.freq.txt}, {@code roots.keys.txt},
 * {@code words.freq.txt}, {@code words.keys.txt}, and - after {@code removeSmaller(10)} is
 * applied to the word histogram - {@code words10.freq.txt} and {@code words10.keys.txt}.
 *
 * @param paragraphs input paragraphs
 * @param outRoot    output directory; created if missing
 * @throws IOException if the directory or any output file cannot be written
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Histogram<String> roots = new Histogram<>(1000_000);
    Histogram<String> words = new Histogram<>(1000_000);
    int paragraphsDone = 0;
    int sentenceTotal = 0;
    int tokenTotal = 0;
    for (String paragraph : paragraphs) {
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        sentenceTotal += sentences.size();
        for (String sentence : sentences) {
            // Tokenization is used only for the token statistic printed at the end.
            tokenTotal += TurkishTokenizer.DEFAULT.tokenize(sentence).size();
            for (SentenceWordAnalysis wordResult : morphology.analyzeAndDisambiguate(sentence)) {
                SingleAnalysis best = wordResult.getBestAnalysis();
                // Short-circuit order matters: later checks run only if earlier ones fail.
                boolean skip = best.getPos() == PrimaryPos.Numeral
                    || best.getPos() == PrimaryPos.Punctuation
                    || best.isUnknown()
                    || (best.isRuntime() && !Strings.containsNone(wordResult.getWordAnalysis().getInput(), "01234567890"))
                    || best.getLemmas().size() == 0;
                if (skip) {
                    continue;
                }
                roots.add(best.getDictionaryItem().lemma);
                String surface = wordResult.getWordAnalysis().getInput();
                boolean properNoun = best.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun;
                words.add(properNoun ? Turkish.capitalize(surface) : surface.toLowerCase(Turkish.LOCALE));
            }
        }
        if (++paragraphsDone % 1000 == 0) {
            System.out.println(paragraphsDone + " of " + paragraphs.size());
        }
    }
    System.out.println("tokenCounter = " + tokenTotal);
    System.out.println("sentenceCounter = " + sentenceTotal);

    Files.createDirectories(outRoot);
    roots.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    roots.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    words.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    words.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // The remaining two files are written from the reduced word histogram.
    words.removeSmaller(10);
    words.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    words.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 10 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

In the class Scripts, the method saveUnambiguous.

/**
 * Analyzes the given sentences and writes them, with all analyses of every word, to a
 * UTF-8 text file.
 *
 * <p>Sentences whose best analyses contain an unknown word are skipped. For each written
 * sentence the output holds an {@code S:<sentence>} line, then for every word its input
 * followed by one line per analysis in long format. When a word has more than one
 * analysis, the one equal to the best analysis is suffixed with {@code *}. A blank line
 * ends each sentence.
 *
 * @param sentences  sentences to analyze
 * @param morphology analyzer used for disambiguation
 * @param out        file to write
 * @throws IOException if the output file cannot be opened
 */
public static void saveUnambiguous(List<String> sentences, TurkishMorphology morphology, Path out) throws IOException {
    try (PrintWriter writer = new PrintWriter(out.toFile(), "utf-8")) {
        for (String sentence : sentences) {
            SentenceAnalysis sentenceResult = morphology.analyzeAndDisambiguate(sentence);
            boolean hasUnknown = sentenceResult.bestAnalysis().stream().anyMatch(SingleAnalysis::isUnknown);
            if (hasUnknown) {
                continue;
            }
            writer.format("S:%s%n", sentence);
            for (SentenceWordAnalysis wordResult : sentenceResult) {
                WordAnalysis candidates = wordResult.getWordAnalysis();
                SingleAnalysis chosen = wordResult.getBestAnalysis();
                writer.println(candidates.getInput());
                for (SingleAnalysis candidate : candidates) {
                    if (candidates.analysisCount() == 1) {
                        // A sole analysis needs no marker.
                        writer.println(candidate.formatLong());
                        continue;
                    }
                    String marker = candidate.equals(chosen) ? "*" : "";
                    writer.format("%s%s%n", candidate.formatLong(), marker);
                }
            }
            writer.println();
        }
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 35 · SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 22 · SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 19 · WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 19 · ArrayList (java.util.ArrayList): 13 · TurkishMorphology (zemberek.morphology.TurkishMorphology): 10 · PrintWriter (java.io.PrintWriter): 5 · Histogram (zemberek.core.collections.Histogram): 5 · Test (org.junit.Test): 4 · Token (zemberek.tokenization.Token): 4 · Stopwatch (com.google.common.base.Stopwatch): 3 · IOException (java.io.IOException): 3 · Path (java.nio.file.Path): 3 · Ignore (org.junit.Ignore): 3 · Log (zemberek.core.logging.Log): 3 · Lists (com.google.common.collect.Lists): 2 · Paths (java.nio.file.Paths): 2 · Collections (java.util.Collections): 2 · LinkedHashSet (java.util.LinkedHashSet): 2 · List (java.util.List): 2