Search in sources :

Example 1 with ResultSentence

use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.

The class GenerateDataWithRules, method extractData.

/**
 * Runs the rule based disambiguation over every regular file directly under {@code p} and
 * writes the collected results to two text files in {@code outRoot}.
 *
 * <p>Output files are named after the last path element of {@code p}:
 * <ul>
 *   <li>{@code <name>-rule-result.txt}: for each sentence, an {@code S:} line followed by the
 *       lines returned by {@code getForTrainingOutput()} of each analysis.</li>
 *   <li>{@code <name>-rule-result-amb.txt}: for each sentence, an {@code S:} line followed by
 *       each token and the long format of every choice for that token.</li>
 * </ul>
 * A blank line terminates each sentence in both files.
 *
 * @param p directory (or single file) whose immediate regular files are processed
 * @param outRoot directory the two result files are written into
 * @param resultLimit when positive, processing of further files stops once the number of
 *     collected results exceeds this value; non-positive means no limit
 * @param maxAmbigiousWordCount forwarded to {@code collect}
 * @throws IOException if the input cannot be listed or the output files cannot be opened
 */
private void extractData(Path p, Path outRoot, int resultLimit, int maxAmbigiousWordCount) throws IOException {
    List<Path> files;
    // Files.walk keeps a directory handle open until the returned stream is closed, so it must
    // be used in try-with-resources. Fully qualified to avoid needing an extra import.
    try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
        files = walk.filter(s -> s.toFile().isFile()).collect(Collectors.toList());
    }
    BatchResult result = new BatchResult();
    int i = 0;
    for (Path file : files) {
        Log.info("Processing %s", file);
        LinkedHashSet<String> sentences = getSentences(file);
        collect(result, sentences, maxAmbigiousWordCount, resultLimit);
        i++;
        Log.info("%d of %d", i, files.size());
        // Same limit test as in collect(); stops visiting further files once it trips.
        if (resultLimit > 0 && result.results.size() > resultLimit) {
            break;
        }
    }
    String s = p.toFile().getName();
    Log.info("Saving.");
    Path out = outRoot.resolve(s + "-rule-result.txt");
    Path amb = outRoot.resolve(s + "-rule-result-amb.txt");
    try (PrintWriter pwu = new PrintWriter(out.toFile(), "utf-8");
        PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
        for (ResultSentence sentence : result.results) {
            pwu.println("S:" + sentence.sentence);
            pwa.println("S:" + sentence.sentence);
            for (AmbiguityAnalysis analysis : sentence.results) {
                // Training-format lines go to the main file only.
                List<String> forTrain = analysis.getForTrainingOutput();
                forTrain.forEach(pwu::println);
                // The "amb" file lists the token followed by all of its candidate analyses.
                pwa.println(analysis.token);
                for (AnalysisDecision r : analysis.choices) {
                    pwa.println(r.analysis.formatLong());
                }
            }
            pwu.println();
            pwa.println();
        }
    }
}
Also used : Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) Predicate(java.util.function.Predicate) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) Collection(java.util.Collection) IOException(java.io.IOException) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) PrintWriter(java.io.PrintWriter)

Example 2 with ResultSentence

use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.

The class RuleBasedDisambiguatorTest, method test.

/**
 * Smoke test: disambiguates a single hard-coded sentence with the rules loaded from resources
 * and prints the all-ignored count followed by the training-format output of every analysis.
 * It makes no assertions.
 */
@Test
public void test() throws IOException {
    // Other sentences that have been tried here:
    //   "ABD Açık Serena Williams'ın"
    //   "Çünkü birbirine tezat oluşturuyor."
    //   "O anda gördüm."
    //   "Aklımıza ilk gelen emeği öncelemek."
    //   "Petrolün Türkiye üzerinden dünya pazarına satılması."
    //   "Sadece partimi iktidar yaptım."
    String input = "4 Neden önemli?";

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    RuleBasedDisambiguator ruleBased = new RuleBasedDisambiguator(morphology, Rules.fromResources());

    ResultSentence outcome = ruleBased.disambiguate(input);
    System.out.println(outcome.allIgnoredCount());
    for (AmbiguityAnalysis analysis : outcome.results) {
        for (String line : analysis.getForTrainingOutput()) {
            System.out.println(line);
        }
    }
}
Also used : AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) TurkishMorphology(zemberek.morphology.TurkishMorphology) RuleBasedDisambiguator(zemberek.morphology.ambiguity.RuleBasedDisambiguator) Test(org.junit.Test)

Example 3 with ResultSentence

use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence in project zemberek-nlp by ahmetaa.

The class GenerateDataWithRules, method collect.

/**
 * Disambiguates the given sentences in groups of 5000 and records the outcome in
 * {@code batchResult}.
 *
 * <p>A sentence is silently skipped when it has more ambiguous words than
 * {@code maxAmbigiousWordCount} or contains a word with zero analyses, and skipped with a
 * warning when all analyses of some word are ignored. A sentence with a word rejected by any
 * of {@code acceptWordPredicates} is added to {@code batchResult.ignoredSentences}. Every other
 * sentence is added to {@code batchResult.acceptedSentences} and its result to
 * {@code batchResult.results}.
 *
 * @param batchResult accumulator that is updated in place
 * @param sentences sentences to process
 * @param maxAmbigiousWordCount upper bound on ambiguous words for a sentence to be kept
 * @param resultLimit when positive, the method returns as soon as the number of collected
 *     results exceeds this value
 */
private void collect(BatchResult batchResult, Collection<String> sentences, int maxAmbigiousWordCount, int resultLimit) {
    List<List<String>> batches = group(new ArrayList<>(sentences), 5000);
    for (List<String> batch : batches) {
        LinkedHashSet<String> candidates = getAccpetableSentences(batch);
        Log.info("Processing.. %d found.", batchResult.acceptedSentences.size());
        for (String candidate : candidates) {
            ResultSentence disambiguated = ruleBasedDisambiguator.disambiguate(candidate);
            if (disambiguated.ambiguousWordCount() > maxAmbigiousWordCount) {
                continue;
            }
            if (disambiguated.zeroAnalysisCount() > 0) {
                continue;
            }
            if (disambiguated.allIgnoredCount() > 0) {
                Log.warn("Sentence [%s] contains word(s) that all analyses are ignored.", disambiguated.sentence);
                continue;
            }
            if (!everyWordAccepted(disambiguated)) {
                batchResult.ignoredSentences.add(candidate);
                continue;
            }
            batchResult.acceptedSentences.add(candidate);
            batchResult.results.add(disambiguated);
            if (resultLimit > 0 && batchResult.results.size() > resultLimit) {
                return;
            }
        }
    }
}

/** Returns true when every word analysis of the sentence passes all of acceptWordPredicates. */
private boolean everyWordAccepted(ResultSentence disambiguated) {
    for (WordAnalysis word : disambiguated.sentenceAnalysis) {
        for (Predicate<WordAnalysis> check : acceptWordPredicates) {
            if (!check.test(word)) {
                return false;
            }
        }
    }
    return true;
}
Also used : SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) List(java.util.List) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence)

Aggregations

ResultSentence (zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence): 3; ArrayList (java.util.ArrayList): 2; List (java.util.List): 2; AmbiguityAnalysis (zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis): 2; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 2; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 2; Lists (com.google.common.collect.Lists): 1; IOException (java.io.IOException): 1; PrintWriter (java.io.PrintWriter): 1; Files (java.nio.file.Files): 1; Path (java.nio.file.Path): 1; Paths (java.nio.file.Paths): 1; Collection (java.util.Collection): 1; Collections (java.util.Collections): 1; HashSet (java.util.HashSet): 1; LinkedHashSet (java.util.LinkedHashSet): 1; Predicate (java.util.function.Predicate): 1; Collectors (java.util.stream.Collectors): 1; Test (org.junit.Test): 1; Histogram (zemberek.core.collections.Histogram): 1