Search in sources:

Example 6 with TurkishSentenceAnalyzer

use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.

In class FindPOS, method main.

/**
 * Runs {@code FindPOS} on a single sample sentence.
 *
 * <p>A default {@link TurkishMorphology} and a {@link Z3MarkovModelDisambiguator} are created
 * (in that order) and wrapped in a {@link TurkishSentenceAnalyzer}, which is handed to
 * {@code FindPOS}.
 *
 * @param args command line arguments; not read
 * @throws IOException declared by this entry point; propagated from the calls below
 */
public static void main(String[] args) throws IOException {
    // Arguments are evaluated left to right, so morphology is still built before the disambiguator.
    TurkishSentenceAnalyzer analyzer = new TurkishSentenceAnalyzer(
            TurkishMorphology.createWithDefaults(),
            new Z3MarkovModelDisambiguator());
    FindPOS finder = new FindPOS(analyzer);
    finder.test("Keşke yarın hava güzel olsa.");
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology)

Example 7 with TurkishSentenceAnalyzer

use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.

In class TurkishSentenceAnalyzerTest, method setUp.

/**
 * Test fixture: assigns {@code parser} a {@link TurkishSentenceAnalyzer} built from a default
 * {@link TurkishMorphology} and a new {@link Z3MarkovModelDisambiguator}.
 *
 * @throws Exception propagated from the construction calls below
 */
@Before
public void setUp() throws Exception {
    // Same construction order as before: morphology first, then the disambiguator.
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Z3MarkovModelDisambiguator disambiguator = new Z3MarkovModelDisambiguator();
    parser = new TurkishSentenceAnalyzer(morphology, disambiguator);
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Before(org.junit.Before)

Example 8 with TurkishSentenceAnalyzer

use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method performance.

/**
 * Manual throughput measurement (marked {@code @Ignore}, "Not a Test.").
 *
 * <p>Reads every line of a hard-coded corpus file and logs elapsed time and tokens/sec for three
 * consecutive phases over the same lines:
 * <ol>
 *   <li>tokenization with {@code TurkishTokenizer.DEFAULT};</li>
 *   <li>{@code sentenceAnalyzer.analyze(line)};</li>
 *   <li>{@code sentenceAnalyzer.bestParse(line)}.</li>
 * </ol>
 * The token counts gathered in phase 1 are reused as the numerator for the speeds reported in
 * phases 2 and 3.
 *
 * @throws IOException declared for the {@code Files.readAllLines} call
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
    // Input corpus location is hard-coded to a developer-local path.
    List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
    Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
    // Morphology is built with default dictionaries and with both the unidentified-token analyzer
    // and the cache switched off through the builder.
    TurkishMorphology analyzer = TurkishMorphology.builder().addDefaultDictionaries().disableUnidentifiedTokenAnalyzer().disableCache().build();
    TurkishSentenceAnalyzer sentenceAnalyzer = new TurkishSentenceAnalyzer(analyzer, new Z3MarkovModelDisambiguator());
    Log.info(lines.size() + " lines will be processed.");
    Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
    // Phase 1: tokenization. Two counters: all tokens except SpaceTab, and tokens that are
    // neither Punctuation nor SpaceTab.
    long tokenCount = 0;
    long tokenCountNoPunct = 0;
    Stopwatch clock = Stopwatch.createStarted();
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    for (String line : lines) {
        List<Token> tokens = lexer.tokenize(line);
        // Note: both counting passes below run inside the timed region.
        tokenCount += tokens.stream().filter(s -> (s.getType() != TurkishLexer.SpaceTab)).count();
        tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != TurkishLexer.Punctuation && s.getType() != TurkishLexer.SpaceTab)).count();
    }
    // elapsed is in milliseconds; count * 1000d / elapsed converts to tokens per second
    // (floating-point division, so a zero elapsed value does not throw).
    long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Token Count = " + tokenCount);
    Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
    Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
    Log.info("");
    // Phase 2: sentence analysis of every line, timed from a freshly reset stopwatch.
    Log.info("Sentence word analysis test:");
    int counter = 0;
    clock.reset().start();
    for (String line : lines) {
        try {
            SentenceAnalysis res = sentenceAnalyzer.analyze(line);
            // for preventing VM optimizations.
            counter += res.size();
        } catch (Exception e) {
            // A failing line is logged with its stack trace and the run continues with the next line.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    // Speeds use the token counts from phase 1 divided by this phase's elapsed time.
    Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    Log.info(analyzer.toString());
    Log.info("");
    // Phase 3: bestParse on every line.
    Log.info("Disambiguation Test:");
    // NOTE(review): presumably clears cached analyses so this phase starts cold; the builder above
    // already called disableCache() — confirm whether this call still has an effect.
    analyzer.invalidateAllCache();
    clock.reset().start();
    for (String line : lines) {
        try {
            List<WordAnalysis> results = sentenceAnalyzer.bestParse(line);
            // for preventing VM optimizations.
            counter += results.size();
        } catch (Exception e) {
            // Same best-effort handling as in phase 2.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    // Emit the accumulated result sizes from phases 2 and 3 so the value is actually used.
    Log.info(counter);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) IOException(java.io.IOException) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 9 with TurkishSentenceAnalyzer

use of zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer in project zemberek-nlp by ahmetaa.

In class AutomaticLabelingExperiment, method generateSetForLabelExperiment.

/**
 * Builds the labeled text lines used by the label experiment.
 *
 * <p>Steps:
 * <ol>
 *   <li>Load the documents found at {@code input} into a {@code WebCorpus}.</li>
 *   <li>Count every label longer than one character, lower-cased with {@code Turkish.LOCALE}.
 *       The histogram is saved as {@code labels-all}, pruned with {@code removeSmaller(15)} and
 *       saved again as {@code labels-reduced}, both under {@code experimentRoot}.</li>
 *   <li>For each document with a not-yet-seen hash, emit one line
 *       {@code "#<id> <__label__ tags> <processed content>"}. A document is skipped when none of
 *       its labels is in the pruned histogram or when its processed content is shorter than
 *       200 characters.</li>
 * </ol>
 *
 * <p>Labels are looked up in the histogram in the same lower-cased form they were counted in, so
 * a label such as {@code "Spor"} matches the {@code "spor"} histogram entry instead of being
 * dropped.
 *
 * @param input path the documents are loaded from
 * @param analyzer sentence analyzer forwarded to {@code processContent}
 * @param useRoots flag forwarded to {@code processContent}
 * @return the generated lines, shuffled with the fixed seed 1 and collected into a
 *     {@link LinkedHashSet} (shuffled order kept, exact duplicate lines removed)
 * @throws IOException if one of the underlying I/O calls fails
 */
Set<String> generateSetForLabelExperiment(Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots) throws IOException {
    WebCorpus corpus = new WebCorpus("label", "labeled");
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    Log.info("Extracting data.");
    Histogram<String> labelCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        List<String> lowerCase = document.getLabels().stream()
                .filter(s -> s.length() > 1)
                .map(s -> s.toLowerCase(Turkish.LOCALE))
                .collect(Collectors.toList());
        labelCounts.add(lowerCase);
    }
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
    Log.info("All label count = %d", labelCounts.size());
    labelCounts.removeSmaller(15);
    Log.info("Reduced label count = %d", labelCounts.size());
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    Set<Long> contentHash = new HashSet<>();
    for (WebDocument document : corpus.getDocuments()) {
        Long hash = document.getHash();
        // Set.add returns false when the hash was already present: skip duplicate content.
        if (!contentHash.add(hash)) {
            continue;
        }
        List<String> labelTags = new ArrayList<>();
        for (String label : document.getLabels()) {
            // The histogram keys are lower-cased, so the lookup must use the same form.
            String normalized = label.toLowerCase(Turkish.LOCALE);
            if (!labelCounts.contains(normalized)) {
                continue;
            }
            String tag = "__label__" + normalized.replaceAll("[ ]+", "_");
            // Case variants of one label now map to the same tag; keep it only once.
            if (!labelTags.contains(tag)) {
                labelTags.add(tag);
            }
        }
        if (labelTags.isEmpty()) {
            continue;
        }
        String labelStr = String.join(" ", labelTags);
        String content = document.getContentAsString();
        String processed = processContent(analyzer, content, useRoots);
        if (processed.length() < 200) {
            continue;
        }
        set.add("#" + document.getId() + " " + labelStr + " " + processed);
        // Pre-increment so the message reports the real number of emitted lines (1000, 2000, ...).
        if (++c % 1000 == 0) {
            Log.info("%d processed.", c);
        }
    }
    Log.info("Generate train and test set.");
    // Fixed seed keeps the shuffled order reproducible between runs.
    Collections.shuffle(set, new Random(1));
    return new LinkedHashSet<>(set);
}
Also used : TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Stopwatch(com.google.common.base.Stopwatch) WebCorpus(zemberek.corpus.WebCorpus) Token(org.antlr.v4.runtime.Token) Random(java.util.Random) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) WebDocument(zemberek.corpus.WebDocument) Random(java.util.Random) WebCorpus(zemberek.corpus.WebCorpus) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

Z3MarkovModelDisambiguator (zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator)9 TurkishMorphology (zemberek.morphology.analysis.tr.TurkishMorphology)9 TurkishSentenceAnalyzer (zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer)9 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)5 Token (org.antlr.v4.runtime.Token)4 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)4 Stopwatch (com.google.common.base.Stopwatch)3 Ignore (org.junit.Ignore)3 Test (org.junit.Test)3 Histogram (zemberek.core.collections.Histogram)3 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)3 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 WebCorpus (zemberek.corpus.WebCorpus)2 WebDocument (zemberek.corpus.WebDocument)2 PrintWriter (java.io.PrintWriter)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1