Search in sources:

Example 21 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class HunspellOperations method generateAnnotationFileSingleSplit.

/**
 * Builds one stem/ending annotation line per analyzable vocabulary word and writes all lines
 * to {@code data/vocabulary/annonations.txt}.
 *
 * <p>For every analysis of a word (analyses whose dictionary item is a proper noun or an
 * abbreviation are skipped) and every stem of that analysis, an entry of the form
 * {@code "word stem ending"} (or {@code "word stem"} when nothing follows the stem) is
 * collected. Duplicate entries are dropped while keeping first-seen order, and the entries of
 * a word are joined with commas. A word whose analyses yield no entry still contributes an
 * empty line.
 *
 * @param vocab UTF-8 text file with one word per line
 * @throws IOException if the vocabulary cannot be read or the output cannot be written
 */
private static void generateAnnotationFileSingleSplit(Path vocab) throws IOException {
    List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> annotations = new ArrayList<>();
    for (String word : words) {
        WordAnalysis analysis = morphology.analyze(word);
        if (!analysis.isCorrect()) {
            Log.warn("Cannot analyze %s", word);
            continue;
        }
        // LinkedHashSet: de-duplicates entries but keeps the order in which they were produced.
        LinkedHashSet<String> stemEndings = new LinkedHashSet<>();
        for (SingleAnalysis s : analysis) {
            if (s.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun || s.getDictionaryItem().secondaryPos == SecondaryPos.Abbreviation) {
                continue;
            }
            List<String> stems = s.getStems();
            for (String stem : stems) {
                // Guard: substring(stem.length()) would throw StringIndexOutOfBoundsException
                // if the stem is longer than the surface word; skip such stems instead of
                // aborting the whole run.
                if (stem.length() > word.length()) {
                    Log.warn("Stem %s is longer than word %s", stem, word);
                    continue;
                }
                String ending = word.substring(stem.length());
                // The split is only valid when the stem is a literal prefix of the word.
                if (!(stem + ending).equals(word)) {
                    Log.warn("Stem + Ending %s+%s does not match word %s", stem, ending, word);
                    continue;
                }
                if (ending.length() > 0) {
                    stemEndings.add(word + " " + stem + " " + ending);
                } else {
                    stemEndings.add(word + " " + stem);
                }
            }
        }
        annotations.add(String.join(",", stemEndings));
    }
    Files.write(Paths.get("data/vocabulary/annonations.txt"), annotations, StandardCharsets.UTF_8);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 22 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class HunspellOperations method filterVocab.

/**
 * Copies to {@code outFile} only those lines of {@code vocabFile} for which the default
 * morphology reports a correct analysis; every rejected word is logged as a warning.
 *
 * @param vocabFile UTF-8 text file with one word per line
 * @param outFile destination for the accepted words, written as UTF-8
 * @throws IOException if reading the input or writing the output fails
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
    List<String> candidates = Files.readAllLines(vocabFile, StandardCharsets.UTF_8);
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    List<String> accepted = new ArrayList<>();
    for (String candidate : candidates) {
        if (analyzer.analyze(candidate).isCorrect()) {
            accepted.add(candidate);
        } else {
            Log.warn("Cannot analyze %s", candidate);
        }
    }
    Files.write(outFile, accepted, StandardCharsets.UTF_8);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 23 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class DataConverter method extract.

/**
 * Loads a data set of sentences with reference parses, tries to match each token's reference
 * parse against one of the analyses produced by the morphology, and saves the sentences in
 * which every token was matched.
 *
 * <p>Each reference parse string is first rewritten by a fixed sequence of regex replacements
 * and lowercased, then optionally remapped through {@code lookup}. It is compared against
 * {@code convert(analysis)} for every analysis of the token. If nothing matches and the token
 * starts with an upper-case character, a second attempt is made with {@code +prop} inserted
 * after the first {@code +noun}. Unmatched token/parse pairs are counted and written to
 * {@code parse-fails.txt} after {@code removeSmaller(3)} is applied to the counts.
 *
 * @param dataPath file holding the data set, read as UTF-8
 * @param output destination passed to {@code Scripts.saveUnambiguous}
 * @throws IOException if reading the data set or writing a result file fails
 */
private static void extract(Path dataPath, Path output) throws IOException {
    DataSet set = com.google.common.io.Files.asCharSource(dataPath.toFile(), Charsets.UTF_8).readLines(new DataSetLoader());
    TurkishMorphology morphology = TurkishMorphology.create(RootLexicon.builder().addTextDictionaryResources("tr/master-dictionary.dict", "tr/non-tdk.dict", "tr/proper.dict", "tr/proper-from-corpus.dict", "tr/abbreviations.dict", "tr/person-names.dict").build());
    List<SentenceAnalysis> result = new ArrayList<>();
    Histogram<String> parseFails = new Histogram<>();
    for (SentenceData sentenceData : set) {
        List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
        // Tokens and reference parses are paired by index, so the counts must agree.
        if (tokens.isEmpty() || tokens.size() != sentenceData.correctParse.size()) {
            continue;
        }
        List<SentenceWordAnalysis> correctList = new ArrayList<>();
        for (int i = 0; i < tokens.size(); i++) {
            String s = tokens.get(i);
            String p = sentenceData.correctParse.get(i);
            // Rewrite the reference parse; the order of these replacements matters.
            p = p.replaceAll("PCNom", "PCNOM");
            p = p.replaceAll("Pnon|Nom", "");
            p = p.replaceAll("\\+Pos\\+", "+");
            p = p.replaceAll("\\+Pos\\^DB", "^DB");
            p = p.replaceAll("[+]+", "+");
            p = p.replaceAll("[+]$", "");
            p = p.replaceAll("[+]\\^DB", "^DB");
            p = p.replaceAll("[.]", "");
            p = p.toLowerCase(Turkish.LOCALE);
            p = p.replaceAll("adverb", "adv");
            p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
            // NOTE(review): p was lowercased above, so this capitalized pattern looks like it
            // can never match. Left unchanged; confirm the intended behavior.
            p = p.replaceAll("\\+Unable", "^DB+Verb+Able+Neg");
            if (lookup.containsKey(p)) {
                p = lookup.get(p);
            }
            WordAnalysis a = morphology.analyze(s);
            if (!a.isCorrect()) {
                // Leaving the token loop early keeps correctList shorter than tokens, so the
                // sentence is not added to the result below.
                break;
            }
            SingleAnalysis best = null;
            for (SingleAnalysis analysis : a) {
                String of = convert(analysis);
                if (of.equals(p)) {
                    best = analysis;
                    break;
                }
            }
            if (best == null) {
                // Second chance: a capitalized token whose reference parse is a plain noun is
                // retried as a proper noun.
                if (Character.isUpperCase(s.charAt(0)) && (p.contains("+noun") && !p.contains("prop"))) {
                    String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
                    for (SingleAnalysis analysis : a) {
                        String of = convert(analysis);
                        if (of.equals(pp)) {
                            best = analysis;
                            break;
                        }
                    }
                }
            }
            if (best == null) {
                parseFails.add(s + " " + p);
            } else {
                correctList.add(new SentenceWordAnalysis(best, a));
            }
        }
        if (correctList.size() == tokens.size()) {
            result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
        }
    }
    Scripts.saveUnambiguous(result, output);
    parseFails.removeSmaller(3);
    parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
    System.out.format("Full Sentence Match  = %d in %d%n", result.size(), set.sentences.size());
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) DataSet(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSet) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceData(zemberek.morphology.old_ambiguity.AbstractDisambiguator.SentenceData) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) DataSetLoader(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSetLoader)

Example 24 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class WordHistogram method generateHistograms.

/**
 * Counts disambiguated root lemmas and surface words over the given paragraphs and writes
 * the resulting histograms under {@code outRoot}.
 *
 * <p>A word is left out when its best analysis is a numeral or punctuation, is unknown, is a
 * runtime analysis whose input contains a digit, or has no lemmas. Surface words are
 * capitalized when the best analysis is a proper noun and lowercased with the Turkish locale
 * otherwise. Progress is printed every 1000 paragraphs, and the sentence and token totals are
 * printed at the end.
 *
 * @param paragraphs text paragraphs to process
 * @param outRoot directory that receives the histogram files; created if needed
 * @throws IOException if the directory or any histogram file cannot be written
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Histogram<String> rootCounts = new Histogram<>(1000_000);
    Histogram<String> wordCounts = new Histogram<>(1000_000);
    int processedParagraphs = 0;
    int totalSentences = 0;
    int totalTokens = 0;
    for (String paragraph : paragraphs) {
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        totalSentences += sentences.size();
        for (String sentence : sentences) {
            // The tokenizer is consulted only for the token total.
            totalTokens += TurkishTokenizer.DEFAULT.tokenize(sentence).size();
            for (SentenceWordAnalysis entry : morphology.analyzeAndDisambiguate(sentence)) {
                SingleAnalysis best = entry.getBestAnalysis();
                // Short-circuit chain keeps the checks in their original order.
                if (best.getPos() == PrimaryPos.Numeral
                        || best.getPos() == PrimaryPos.Punctuation
                        || best.isUnknown()
                        || (best.isRuntime() && !Strings.containsNone(entry.getWordAnalysis().getInput(), "01234567890"))
                        || best.getLemmas().isEmpty()) {
                    continue;
                }
                rootCounts.add(best.getDictionaryItem().lemma);
                String surface = entry.getWordAnalysis().getInput();
                boolean properNoun = best.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun;
                wordCounts.add(properNoun ? Turkish.capitalize(surface) : surface.toLowerCase(Turkish.LOCALE));
            }
        }
        processedParagraphs++;
        if (processedParagraphs % 1000 == 0) {
            System.out.println(processedParagraphs + " of " + paragraphs.size());
        }
    }
    System.out.println("tokenCounter = " + totalTokens);
    System.out.println("sentenceCounter = " + totalSentences);
    Files.createDirectories(outRoot);
    rootCounts.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    rootCounts.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    wordCounts.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // The "words10" files are written after removeSmaller(10) has pruned the word histogram.
    wordCounts.removeSmaller(10);
    wordCounts.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 25 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class Scripts method saveUnambigious.

/**
 * Reads the gold test sentences, passes the lines through {@code clean}, and hands them to
 * {@code saveUnambiguous} together with a default morphology and the gold output path.
 *
 * @throws IOException if the sentence file cannot be read or the output cannot be written
 */
static void saveUnambigious() throws IOException {
    // An alternative input, "data/gold/test.txt", was left disabled by the original author.
    Path sentencesFile = Paths.get("data/gold/gold-test.sentences");
    Path resultFile = Paths.get("data/gold/gold-test.txt");
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    saveUnambiguous(
        clean(Files.readAllLines(sentencesFile)),
        analyzer,
        resultFile);
}
Also used : Path(java.nio.file.Path) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5