Search in sources :

Example 1 with SingleAnalysisSentence

use of zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence in project zemberek-nlp by ahmetaa.

The example is the method extracData of the class _MorphologicalAmbiguityResolverExperiment.

/**
 * Gathers sentences from the {@code .corpus} files found directly under {@code p} and writes
 * them to {@code outRoot}.
 *
 * <p>Each matching file is passed to {@code collect(file, maxAnalysisCount)} and the returned
 * sentences are accumulated in insertion order without duplicates. The accumulated sentences are
 * written to {@code <name>-unambigious.txt}, where {@code <name>} is the last element of
 * {@code p}: one line with the sentence, one line per analysis holding the result of
 * {@code formatSurfaceSequence()}, then an empty line. Afterwards the {@code failedWords}
 * histogram is saved twice, sorted by key ({@code <name>-failed.txt}) and by count
 * ({@code <name>-failed.freq.txt}).
 *
 * @param p directory that is scanned (depth 1) for files whose name ends with {@code .corpus}
 * @param outRoot directory the output files are resolved against
 * @param maxAnalysisCount forwarded unchanged to {@code collect}
 * @param resultLimit when positive, no further files are processed once the number of collected
 *     sentences exceeds this value; the check runs only after a whole file has been added, so
 *     the final result may contain more than {@code resultLimit} sentences
 * @throws IOException if the directory cannot be traversed or an output file cannot be written
 */
public void extracData(Path p, Path outRoot, int maxAnalysisCount, int resultLimit) throws IOException {
    List<Path> files;
    // Files.walk keeps a directory handle open until the stream is closed, so close it explicitly.
    try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
        files = walk
            .filter(f -> f.toFile().isFile() && f.toFile().getName().endsWith(".corpus"))
            .collect(Collectors.toList());
    }
    // LinkedHashSet: drops duplicate sentences while keeping the order in which they were found.
    LinkedHashSet<SingleAnalysisSentence> result = new LinkedHashSet<>();
    int i = 0;
    for (Path file : files) {
        List<SingleAnalysisSentence> collect = collect(file, maxAnalysisCount);
        result.addAll(collect);
        i++;
        Log.info("%d of %d", i, files.size());
        // The limit is evaluated per file, not per sentence.
        if (resultLimit > 0 && result.size() > resultLimit) {
            break;
        }
    }
    String s = p.toFile().getName();
    Path out = outRoot.resolve(s + "-unambigious.txt");
    try (PrintWriter pw = new PrintWriter(out.toFile(), "utf-8")) {
        for (SingleAnalysisSentence sentence : result) {
            pw.println(sentence.sentence);
            for (Single single : sentence.tokens) {
                for (_SingleAnalysis r : single.res) {
                    pw.println(r.formatSurfaceSequence());
                }
            }
            // Empty line separates consecutive sentence records.
            pw.println();
        }
    }
    // saving failed words.
    failedWords.saveSortedByKeys(outRoot.resolve(s + "-failed.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // saving failed words by frequency.
    failedWords.saveSortedByCounts(outRoot.resolve(s + "-failed.freq.txt"), " ");
}
Also used : Path(java.nio.file.Path) Token(org.antlr.v4.runtime.Token) Strings(zemberek.core.io.Strings) HashMap(java.util.HashMap) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) ArrayList(java.util.ArrayList) InterpretingAnalyzer(zemberek.morphology._analyzer.InterpretingAnalyzer) zemberek.morphology._analyzer._SingleAnalysis(zemberek.morphology._analyzer._SingleAnalysis) Map(java.util.Map) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Objects(java.util.Objects) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) LanguageIdentifier(zemberek.langid.LanguageIdentifier) Pattern(java.util.regex.Pattern) RootLexicon(zemberek.morphology.lexicon.RootLexicon) LinkedHashSet(java.util.LinkedHashSet) zemberek.morphology._analyzer._SingleAnalysis(zemberek.morphology._analyzer._SingleAnalysis) PrintWriter(java.io.PrintWriter)

Example 2 with SingleAnalysisSentence

use of zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence in project zemberek-nlp by ahmetaa.

The example is the method extracData of the class _WordCollector.

/**
 * Counts the words occurring in the {@code .corpus} files found directly under {@code p} and
 * saves the counts under {@code outRoot}.
 *
 * <p>For every matching file, lines starting with {@code <} are dropped, the remaining lines are
 * split into sentences and tokenized. Tokens containing a digit or an underscore are skipped.
 * A token whose first character is upper case is stored via {@code Turkish.capitalize}; any
 * other token is lower-cased with {@code Turkish.LOCALE}.
 *
 * <p>Four files are written, each prefixed with the last element of {@code p}:
 * {@code -counts-sorted-name.txt}, {@code -counts-sorted-freq.txt},
 * {@code -words-sorted-freq.txt} and {@code -words-sorted-name.txt}.
 *
 * @param p directory that is scanned (depth 1) for files whose name ends with {@code .corpus}
 * @param outRoot directory the output files are resolved against
 * @param resultLimit currently unused; kept so existing callers keep compiling
 * @return the histogram holding the count of every collected word
 * @throws IOException if the directory cannot be traversed, a corpus file cannot be read, or an
 *     output file cannot be written
 */
public Histogram<String> extracData(Path p, Path outRoot, int resultLimit) throws IOException {
    Histogram<String> words = new Histogram<>(5_000_000);
    List<Path> files;
    // Files.walk keeps a directory handle open until the stream is closed, so close it explicitly.
    try (java.util.stream.Stream<Path> walk = Files.walk(p, 1)) {
        files = walk
            .filter(f -> f.toFile().isFile() && f.toFile().getName().endsWith(".corpus"))
            .collect(Collectors.toList());
    }
    for (Path file : files) {
        Log.info("Processing %s", file);
        // Lines starting with '<' are excluded before sentence extraction.
        List<String> lines = Files.readAllLines(file, StandardCharsets.UTF_8).stream().filter(line -> !line.startsWith("<")).collect(Collectors.toList());
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraphs(lines);
        for (String sentence : sentences) {
            // Collapse runs of whitespace, '/', '-' and no-break spaces into a single space.
            sentence = sentence.replaceAll("[\\s/\\-\\u00a0]+", " ");
            // Remove soft hyphens.
            sentence = sentence.replaceAll("[\\u00ad]", "");
            List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
            for (Token token : tokens) {
                String rawWord = token.getText();
                // Skip tokens that contain a digit or an underscore.
                if (!Strings.containsNone(rawWord, "0123456789_")) {
                    continue;
                }
                String word = Character.isUpperCase(rawWord.charAt(0)) ? Turkish.capitalize(rawWord) : rawWord.toLowerCase(Turkish.LOCALE);
                words.add(word);
            }
        }
        Log.info("Count = %d", words.size());
    }
    String s = p.toFile().getName();
    Log.info("Saving words.");
    // saving word counts sorted by word.
    words.saveSortedByKeys(outRoot.resolve(s + "-counts-sorted-name.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // saving word counts sorted by frequency.
    words.saveSortedByCounts(outRoot.resolve(s + "-counts-sorted-freq.txt"), " ");
    // saving the bare word lists, without counts.
    Files.write(outRoot.resolve(s + "-words-sorted-freq.txt"), words.getSortedList());
    Files.write(outRoot.resolve(s + "-words-sorted-name.txt"), words.getSortedList(Turkish.STRING_COMPARATOR_ASC));
    return words;
}
Also used : Path(java.nio.file.Path) Files(java.nio.file.Files) Strings(zemberek.core.io.Strings) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) SingleAnalysisSentence(zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence) Turkish(zemberek.core.turkish.Turkish) List(java.util.List) Token(zemberek.tokenization.Token) Paths(java.nio.file.Paths) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) SingleAnalysisSentence(zemberek.morphology._MorphologicalAmbiguityResolverExperiment.SingleAnalysisSentence) Token(zemberek.tokenization.Token)

Aggregations

IOException (java.io.IOException)2 StandardCharsets (java.nio.charset.StandardCharsets)2 Files (java.nio.file.Files)2 Path (java.nio.file.Path)2 Paths (java.nio.file.Paths)2 LinkedHashSet (java.util.LinkedHashSet)2 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 Histogram (zemberek.core.collections.Histogram)2 Strings (zemberek.core.io.Strings)2 Log (zemberek.core.logging.Log)2 TurkishSentenceExtractor (zemberek.tokenization.TurkishSentenceExtractor)2 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)2 PrintWriter (java.io.PrintWriter)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Objects (java.util.Objects)1 Pattern (java.util.regex.Pattern)1 Token (org.antlr.v4.runtime.Token)1