Search in sources:

Example 1 with Z3MarkovModelDisambiguator

use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.

the class WordHistogram method generateHistograms.

/**
 * Counts root lemmas and surface forms over the given paragraphs and writes the resulting
 * histograms as text files under {@code outRoot}.
 *
 * <p>Each paragraph is split into sentences, each sentence is analyzed and disambiguated, and the
 * first parse of every entry is inspected. An entry is ignored when that parse is a numeral or
 * punctuation, is unknown, is a runtime analysis whose input contains a digit, or has no lemmas.
 * Surface forms are lower-cased with the Turkish locale unless the dictionary item is marked as a
 * proper noun, in which case they are capitalized.
 *
 * <p>Files written: {@code roots.freq.txt}, {@code roots.keys.txt}, {@code words.freq.txt},
 * {@code words.keys.txt}, and, after {@code removeSmaller(10)} is applied to the word histogram,
 * {@code words10.freq.txt} and {@code words10.keys.txt}.
 *
 * @param paragraphs paragraphs to process
 * @param outRoot directory that receives the output files; created if missing
 * @throws IOException declared for the file-system and save operations performed here
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.builder()
            .addDefaultDictionaries()
            .cacheParameters(75_000, 150_000)
            .build();
    TurkishSentenceAnalyzer sentenceAnalyzer =
            new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
    Histogram<String> rootHistogram = new Histogram<>(1000_000);
    Histogram<String> wordHistogram = new Histogram<>(1000_000);

    int processedParagraphs = 0;
    int sentenceTotal = 0;
    int tokenTotal = 0;

    for (String paragraph : paragraphs) {
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        sentenceTotal += sentences.size();

        for (String sentence : sentences) {
            // Tokenization is used only for the token statistic printed at the end.
            List<Token> sentenceTokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
            tokenTotal += sentenceTokens.size();

            SentenceAnalysis sentenceAnalysis = sentenceAnalyzer.analyze(sentence);
            sentenceAnalyzer.disambiguate(sentenceAnalysis);

            for (SentenceAnalysis.Entry entry : sentenceAnalysis) {
                WordAnalysis top = entry.parses.get(0);

                // Short-circuit order mirrors the sequence of individual guards.
                boolean ignored = top.getPos() == PrimaryPos.Numeral
                        || top.getPos() == PrimaryPos.Punctuation
                        || top.isUnknown()
                        || (top.isRuntime() && !Strings.containsNone(entry.input, "01234567890"))
                        || top.getLemmas().size() == 0;
                if (ignored) {
                    continue;
                }

                rootHistogram.add(top.getDictionaryItem().lemma);

                boolean properNoun =
                        top.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun;
                String surface = properNoun
                        ? Turkish.capitalize(entry.input)
                        : entry.input.toLowerCase(Turkish.LOCALE);
                wordHistogram.add(surface);
            }
        }

        // Progress report every 1000 paragraphs.
        if (++processedParagraphs % 1000 == 0) {
            System.out.println(processedParagraphs + " of " + paragraphs.size());
        }
    }

    System.out.println("tokenCounter = " + tokenTotal);
    System.out.println("sentenceCounter = " + sentenceTotal);

    Files.createDirectories(outRoot);

    rootHistogram.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    rootHistogram.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);

    wordHistogram.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    wordHistogram.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);

    // The full word lists must be written before this call, since it mutates the histogram.
    wordHistogram.removeSmaller(10);
    wordHistogram.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    wordHistogram.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology)

Example 2 with Z3MarkovModelDisambiguator

use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.

the class CategoryPredictionExperiment method generateSets.

/**
 * Loads a web corpus from {@code input}, turns every usable document into one labelled text line
 * and hands the collected lines to {@code saveSets}.
 *
 * <p>Each emitted line has the form {@code #<document id> __label__<category> <text>}, where the
 * category has runs of spaces replaced by {@code _} and is lower-cased with the Turkish locale,
 * and the text has apostrophes removed and is lower-cased as well. Documents are skipped when
 * their category is empty, when {@code useOnlyTitle} is set and the title is empty, or when the
 * category is no longer present in the category histogram after {@code removeSmaller(20)}
 * (presumably categories seen fewer than 20 times; confirm against {@code Histogram}).
 *
 * <p>The category filter is applied before any text is fetched or tokenized, so documents that
 * will be discarded are never tokenized, and the document content is only read when it is
 * actually the text being processed.
 *
 * @param input corpus file to load
 * @param train destination passed to {@code saveSets} for the training set
 * @param test destination passed to {@code saveSets} for the test set
 * @param useOnlyTitle when {@code true} the title is tokenized instead of the content
 * @param useRoots when {@code true} each analyzed word is replaced by the last lemma of its first
 *     parse; words whose first parse is unknown are kept as-is
 * @throws IOException declared for the corpus loading and set saving performed here
 */
private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useRoots) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSentenceAnalyzer analyzer = new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;

    // First pass: count how often each non-empty category occurs.
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    categoryCounts.removeSmaller(20);
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());

    // Second pass: build one labelled line per retained document.
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        String rawCategory = document.getCategory();
        if (rawCategory.length() == 0) {
            continue;
        }
        String title = document.getTitle();
        if (useOnlyTitle && title.length() == 0) {
            continue;
        }
        // Reject filtered-out categories before doing any tokenization work.
        if (!categoryCounts.contains(rawCategory)) {
            continue;
        }
        String category = "__label__" + rawCategory.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);

        // Only fetch the text that is actually going to be tokenized.
        String text = useOnlyTitle ? title : document.getContentAsString();
        List<Token> docTokens = lexer.tokenize(text);
        List<String> reduced = new ArrayList<>(docTokens.size());
        for (Token token : docTokens) {
            int type = token.getType();
            if (type == TurkishLexer.PercentNumeral
                    || type == TurkishLexer.Number
                    || type == TurkishLexer.Punctuation
                    || type == TurkishLexer.RomanNumeral
                    || type == TurkishLexer.Time
                    || type == TurkishLexer.UnknownWord
                    || type == TurkishLexer.Unknown) {
                continue;
            }
            reduced.add(token.getText());
        }

        String join = String.join(" ", reduced);
        if (useRoots) {
            SentenceAnalysis analysis = analyzer.analyze(join);
            analyzer.disambiguate(analysis);
            List<String> res = new ArrayList<>();
            for (SentenceAnalysis.Entry e : analysis) {
                WordAnalysis best = e.parses.get(0);
                if (best.isUnknown()) {
                    // Keep the surface form when no analysis is available.
                    res.add(e.input);
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.size() == 0) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }

        set.add("#" + document.getId() + " " + category + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));

        // Logs after the 1st, 1001st, 2001st, ... emitted document; c is already incremented here.
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    // LinkedHashSet drops duplicate lines while keeping first-seen order.
    saveSets(train, test, new LinkedHashSet<>(set));
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)

Example 3 with Z3MarkovModelDisambiguator

use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.

the class DisambiguateSentences method main.

/**
 * Demo entry point: builds a sentence analyzer from the default morphology and a
 * {@link Z3MarkovModelDisambiguator}, then runs {@code analyzeAndDisambiguate} on a single
 * sample sentence.
 *
 * @param args command-line arguments; not used
 * @throws IOException declared for the resource loading performed during setup
 */
public static void main(String[] args) throws IOException {
    // Arguments are evaluated left to right, so the morphology is still created first.
    TurkishSentenceAnalyzer analyzer = new TurkishSentenceAnalyzer(
            TurkishMorphology.createWithDefaults(),
            new Z3MarkovModelDisambiguator());
    DisambiguateSentences demo = new DisambiguateSentences(analyzer);
    demo.analyzeAndDisambiguate("86 lira harcardım.");
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology)

Example 4 with Z3MarkovModelDisambiguator

use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method testSentenceAnalysis.

/**
 * Manual script (ignored as a test): analyzes one sample sentence and prints its parse results
 * via {@code writeParseResult} both before and after disambiguation.
 *
 * @throws IOException declared for the resource loading performed during setup
 */
@Test
@Ignore("Not a Test.")
public void testSentenceAnalysis() throws IOException {
    TurkishSentenceAnalyzer sentenceAnalyzer = new TurkishSentenceAnalyzer(
            TurkishMorphology.createWithDefaults(),
            new Z3MarkovModelDisambiguator());

    final String input = "Kırmızı kalemi al.";
    Log.info("Sentence  = " + input);

    SentenceAnalysis result = sentenceAnalyzer.analyze(input);

    Log.info("Before disambiguation.");
    writeParseResult(result);

    Log.info("\nAfter disambiguation.");
    // disambiguate works on the same analysis object that is printed again below.
    sentenceAnalyzer.disambiguate(result);
    writeParseResult(result);
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with Z3MarkovModelDisambiguator

use of zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method disambiguationMemoryTest.

/**
 * Manual script (ignored as a test): reads a corpus file from a hard-coded local path and runs
 * {@code bestParse} over every line, 100 times in a row, logging the elapsed milliseconds of
 * each round and finally the accumulated number of parses.
 *
 * @throws IOException if the corpus file cannot be read, or from resource loading during setup
 */
@Test
@Ignore("Not a Test.")
public void disambiguationMemoryTest() throws IOException {
    List<String> corpusLines = Files.readAllLines(Paths.get("/media/depo/data/aaa/corpora/dunya.100k"));
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSentenceAnalyzer analyzer =
            new TurkishSentenceAnalyzer(morphology, new Z3MarkovModelDisambiguator());

    // Accumulated so the parse results are actually consumed; logged once at the end.
    int parseTotal = 0;
    int round = 0;
    while (round < 100) {
        Stopwatch timer = Stopwatch.createStarted();
        for (String corpusLine : corpusLines) {
            parseTotal += analyzer.bestParse(corpusLine).size();
        }
        Log.info(timer.elapsed(TimeUnit.MILLISECONDS));
        round++;
    }
    Log.info(parseTotal);
}
Also used : Stopwatch(com.google.common.base.Stopwatch) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Z3MarkovModelDisambiguator (zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) 8, TurkishMorphology (zemberek.morphology.analysis.tr.TurkishMorphology) 8, TurkishSentenceAnalyzer (zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) 8, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 4, Token (org.antlr.v4.runtime.Token) 3, Ignore (org.junit.Ignore) 3, Test (org.junit.Test) 3, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 3, Stopwatch (com.google.common.base.Stopwatch) 2, Histogram (zemberek.core.collections.Histogram) 2, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer) 2, IOException (java.io.IOException) 1, ArrayList (java.util.ArrayList) 1, Before (org.junit.Before) 1, WebCorpus (zemberek.corpus.WebCorpus) 1, WebDocument (zemberek.corpus.WebDocument) 1