Search in sources :

Example 1 with AnalysisDecision

Use of zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision in project zemberek-nlp by ahmetaa.

From the class GenerateDataWithRules, method extractData.

/**
 * Runs the rule-based collection over the regular files found directly under {@code p}
 * and writes the collected results to two UTF-8 text files below {@code outRoot}.
 *
 * <p>The output files are named after the last path element of {@code p}:
 * {@code <name>-rule-result.txt} receives, per sentence, an {@code S:} line followed by the
 * lines returned by {@code getForTrainingOutput()} for each analysis;
 * {@code <name>-rule-result-amb.txt} receives the same {@code S:} line followed by each
 * analysis token and the long format of every choice. A blank line terminates each sentence
 * in both files.
 *
 * @param p path that is walked to depth 1; only entries that are regular files are processed
 * @param outRoot directory in which the two result files are created
 * @param resultLimit when greater than zero, processing of further files stops as soon as the
 *     number of collected results exceeds this value; also forwarded to {@code collect}
 * @param maxAmbigiousWordCount forwarded unchanged to {@code collect}
 * @throws IOException if walking {@code p} or opening/writing the output files fails
 */
private void extractData(Path p, Path outRoot, int resultLimit, int maxAmbigiousWordCount) throws IOException {
    List<Path> files;
    // Files.walk holds open directory handles until the stream is closed, so it must be
    // wrapped in try-with-resources; the original left it open.
    try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
        files = walk.filter(f -> f.toFile().isFile()).collect(Collectors.toList());
    }
    BatchResult result = new BatchResult();
    int i = 0;
    for (Path file : files) {
        Log.info("Processing %s", file);
        LinkedHashSet<String> sentences = getSentences(file);
        collect(result, sentences, maxAmbigiousWordCount, resultLimit);
        i++;
        Log.info("%d of %d", i, files.size());
        // A non-positive limit means "no limit": keep going through every file.
        if (resultLimit > 0 && result.results.size() > resultLimit) {
            break;
        }
    }
    String s = p.toFile().getName();
    Log.info("Saving.");
    Path out = outRoot.resolve(s + "-rule-result.txt");
    Path amb = outRoot.resolve(s + "-rule-result-amb.txt");
    try (PrintWriter pwu = new PrintWriter(out.toFile(), "utf-8");
        PrintWriter pwa = new PrintWriter(amb.toFile(), "utf-8")) {
        for (ResultSentence sentence : result.results) {
            // Both files start each record with the same sentence header.
            pwu.println("S:" + sentence.sentence);
            pwa.println("S:" + sentence.sentence);
            for (AmbiguityAnalysis analysis : sentence.results) {
                List<String> forTrain = analysis.getForTrainingOutput();
                forTrain.forEach(pwu::println);
                pwa.println(analysis.token);
                for (AnalysisDecision r : analysis.choices) {
                    pwa.println(r.analysis.formatLong());
                }
            }
            // Blank line separates consecutive sentence records.
            pwu.println();
            pwa.println();
        }
    }
}
Also used : Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) Files(java.nio.file.Files) TextUtil(zemberek.core.text.TextUtil) Predicate(java.util.function.Predicate) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) Collection(java.util.Collection) IOException(java.io.IOException) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) Collectors(java.util.stream.Collectors) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) Lists(com.google.common.collect.Lists) Paths(java.nio.file.Paths) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ResultSentence(zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence) AmbiguityAnalysis(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis) AnalysisDecision(zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision) PrintWriter(java.io.PrintWriter)

Aggregations

Lists (com.google.common.collect.Lists)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 List (java.util.List)1 Predicate (java.util.function.Predicate)1 Collectors (java.util.stream.Collectors)1 Histogram (zemberek.core.collections.Histogram)1 Log (zemberek.core.logging.Log)1 TextUtil (zemberek.core.text.TextUtil)1 AmbiguityAnalysis (zemberek.morphology.ambiguity.RuleBasedDisambiguator.AmbiguityAnalysis)1 AnalysisDecision (zemberek.morphology.ambiguity.RuleBasedDisambiguator.AnalysisDecision)1 ResultSentence (zemberek.morphology.ambiguity.RuleBasedDisambiguator.ResultSentence)1