Search in sources :

Example 1 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class FindPOS method test.

/**
 * Prints the given sentence, runs it through the analyzer (analysis followed by
 * disambiguation) and logs, for every entry, its input together with the primary
 * and secondary POS of the dictionary item of the entry's first parse.
 *
 * @param s the sentence to analyze
 */
private void test(String s) {
    System.out.println("Sentence  = " + s);
    SentenceAnalysis result = analyzer.analyze(s);
    analyzer.disambiguate(result);
    result.forEach(item -> {
        // Only the first parse of each entry is reported.
        WordAnalysis first = item.parses.get(0);
        Log.info("%s -> %s : %s ",
                item.input,
                first.dictionaryItem.primaryPos,
                first.dictionaryItem.secondaryPos);
    });
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 2 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishSentenceAnalyzerTest method doParseSentencesInCorpus.

/**
 * Analyzes every sentence of the given corpus file, collects the surface forms of
 * all parses whose dictionary item is {@code DictionaryItem.UNKNOWN}, writes them
 * with their counts to {@code unknown.txt} and prints word count and timing
 * figures to standard output.
 *
 * @param ntvmsnbcCorpus corpus file read as trimmed UTF-8 lines, one sentence per line
 * @throws IOException if the corpus cannot be read or the report cannot be written
 */
private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
    List<String> lines = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
    Stopwatch timer = Stopwatch.createStarted();
    Histogram<String> unknownWords = new Histogram<>();
    long wordTotal = 0;
    int processed = 0;
    for (String line : lines) {
        SentenceAnalysis analyzed = parser.analyze(line);
        for (SentenceAnalysis.Entry item : analyzed) {
            for (WordAnalysis candidate : item.parses) {
                if (candidate.dictionaryItem != DictionaryItem.UNKNOWN) {
                    continue;
                }
                unknownWords.add(candidate.getSurfaceForm());
            }
        }
        wordTotal += analyzed.size();
        // Disambiguation is intentionally not run here: parser.disambiguate(analyzed)
        processed++;
        // Progress report every 10000 sentences: count so far and elapsed seconds.
        if (processed % 10000 == 0) {
            System.out.println(processed);
            System.out.println(timer.elapsed(TimeUnit.MILLISECONDS) / 1000d);
        }
    }
    try (PrintWriter report = new PrintWriter("unknown.txt", "utf-8")) {
        for (String word : unknownWords.getSortedList()) {
            report.println(word + " " + unknownWords.getCount(word));
        }
    }
    System.out.println("Word count = " + wordTotal);
    System.out.println("Elapsed Time =" + timer.elapsed(TimeUnit.MILLISECONDS));
    double wordsTimesThousand = wordTotal * 1000d;
    System.out.println("Parse and disambiguate per second = " + wordsTimesThousand / (timer.elapsed(TimeUnit.MILLISECONDS)));
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter)

Example 3 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method testSentenceAnalysis.

/**
 * Demonstration script (ignored as a test): analyzes a fixed sentence and writes
 * the parse result once before and once after disambiguation.
 *
 * @throws IOException declared by the original signature
 */
@Test
@Ignore("Not a Test.")
public void testSentenceAnalysis() throws IOException {
    // Arguments are evaluated left to right: morphology first, then the disambiguator.
    TurkishSentenceAnalyzer sentenceAnalyzer = new TurkishSentenceAnalyzer(
            TurkishMorphology.createWithDefaults(),
            new Z3MarkovModelDisambiguator());

    final String input = "Kırmızı kalemi al.";
    Log.info("Sentence  = " + input);

    SentenceAnalysis result = sentenceAnalyzer.analyze(input);

    Log.info("Before disambiguation.");
    writeParseResult(result);

    Log.info("\nAfter disambiguation.");
    sentenceAnalyzer.disambiguate(result);
    writeParseResult(result);
}
Also used : Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class GenerateDataWithRules method extractHighlyAmbigiousWordSentences.

/**
 * Scans the regular files directly under {@code inputRoot}, analyzes and
 * disambiguates their sentences, and collects every word analysis whose
 * candidate analyses contain more than {@code minCount} distinct stems. The
 * collected analyses are written, in the histogram's sorted order, to
 * {@code <inputRoot name>-amb.txt} under {@code outRoot}.
 *
 * @param inputRoot directory whose immediate files are processed
 * @param outRoot   directory that receives the output file
 * @param minCount  a word is collected once its number of distinct stems exceeds this value
 * @param wordCount processing stops after the file during which the number of collected
 *                  word analyses exceeded this value
 * @throws IOException if listing the input directory or writing the output fails
 */
private void extractHighlyAmbigiousWordSentences(Path inputRoot, Path outRoot, int minCount, int wordCount) throws IOException {
    List<Path> files;
    // Files.walk holds an open directory handle; it must be closed explicitly.
    try (java.util.stream.Stream<Path> walk = Files.walk(inputRoot, 1)) {
        files = walk.filter(p -> p.toFile().isFile()).collect(Collectors.toList());
    }
    Histogram<WordAnalysis> wordAnalyses = new Histogram<>();
    for (Path file : files) {
        Log.info("Processing %s", file);
        LinkedHashSet<String> sentences = getSentences(file);
        // Sentences are processed in chunks of 5000.
        List<List<String>> group = group(new ArrayList<>(sentences), 5000);
        for (List<String> lines : group) {
            Log.info("Collected %d words.", wordAnalyses.size());
            LinkedHashSet<String> toProcess = getAccpetableSentences(lines);
            for (String sentence : toProcess) {
                try {
                    SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
                    for (SentenceWordAnalysis analysis : sentenceAnalysis) {
                        HashSet<String> stems = new HashSet<>(4);
                        for (SingleAnalysis single : analysis.getWordAnalysis()) {
                            stems.add(single.getStem());
                            // Record the word once and stop as soon as it has enough distinct stems.
                            if (stems.size() > minCount) {
                                wordAnalyses.add(analysis.getWordAnalysis());
                                break;
                            }
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a failing sentence is skipped, but the cause is reported.
                    Log.warn("Error in sentence %s : %s", sentence, e);
                }
            }
        }
        // The limit is checked only after a whole file has been processed.
        if (wordAnalyses.size() > wordCount) {
            break;
        }
    }
    String s = inputRoot.toFile().getName();
    Path amb = outRoot.resolve(s + "-amb.txt");
    try (PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
        for (WordAnalysis wa : wordAnalyses.getSortedList()) {
            // One record per word: the input, each analysis in long format, then a blank line.
            pwa.println(wa.getInput());
            for (SingleAnalysis analysis : wa) {
                pwa.println(analysis.formatLong());
            }
            pwa.println();
        }
    }
}
Also used : Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) Predicate(java.util.function.Predicate) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) Collection(java.util.Collection) IOException(java.io.IOException) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) IOException(java.io.IOException) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) PrintWriter(java.io.PrintWriter)

Example 5 with SentenceAnalysis

use of zemberek.morphology.analysis.SentenceAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method disambiguationExample.

/**
 * Demonstration script (ignored as a test): analyzes a fixed sentence, prints every
 * analysis of every word in long format, then disambiguates the sentence and prints
 * the best analysis of each word.
 *
 * @throws IOException declared by the original signature
 */
@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
    TurkishMorphology turkish = TurkishMorphology.createWithDefaults();
    final String input = "Yarın kar yağacak.";
    System.out.println("Sentence  = " + input);

    List<WordAnalysis> perWord = turkish.analyzeSentence(input);
    System.out.println("Before disambiguation.");
    perWord.forEach(word -> {
        System.out.println("Word = " + word.getInput());
        word.forEach(candidate -> System.out.println(candidate.formatLong()));
    });

    System.out.println("\nAfter disambiguation.");
    SentenceAnalysis disambiguated = turkish.disambiguate(input, perWord);
    for (SingleAnalysis best : disambiguated.bestAnalysis()) {
        System.out.println(best.formatLong());
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)35 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)22 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)19 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)19 ArrayList (java.util.ArrayList)13 TurkishMorphology (zemberek.morphology.TurkishMorphology)10 PrintWriter (java.io.PrintWriter)5 Histogram (zemberek.core.collections.Histogram)5 Test (org.junit.Test)4 Token (zemberek.tokenization.Token)4 Stopwatch (com.google.common.base.Stopwatch)3 IOException (java.io.IOException)3 Path (java.nio.file.Path)3 Ignore (org.junit.Ignore)3 Log (zemberek.core.logging.Log)3 Lists (com.google.common.collect.Lists)2 Paths (java.nio.file.Paths)2 Collections (java.util.Collections)2 LinkedHashSet (java.util.LinkedHashSet)2 List (java.util.List)2