Search in sources :

Example 16 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The main method of the class CorpusNerCollector.

/**
 * Harvests short sentences containing one or two "clean" named entities from a corpus and
 * writes them out as NER training data in batches of 1000 sentences.
 *
 * <p>Every sentence of at most 100 characters is run through the NER model. A sentence is kept
 * only when it has one or two named entities and none of them is rejected by
 * {@code containsBadNamedEntity}. Kept sentences are rendered with
 * {@code AnnotationStyle.BRACKET} and written to files named {@code <chunk id>-<batch index>}
 * under the output directory. The JVM exits after the batch with index 10 has been written.
 *
 * <p>NOTE(review): sentences collected after the last full batch are never written, because a
 * file is only produced when exactly 1000 lines have accumulated. Confirm whether a final
 * partial batch should be flushed.
 *
 * @param args ignored
 * @throws IOException if the output directory cannot be created or a batch file cannot be written
 */
public static void main(String[] args) throws IOException {
    Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
    Path corpusDirList = corporaRoot.resolve("ner-list");
    Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
    Files.createDirectories(outRoot);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);
    // assumes you generated a model in my-model directory.
    Path modelRoot = Paths.get("my-model");
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
    Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
    List<String> lines = new ArrayList<>();
    // Index of the next batch file to write.
    int k = 0;
    for (TextChunk chunk : corpusProvider) {
        // LinkedHashSet drops duplicate sentences within a chunk while keeping their order.
        LinkedHashSet<String> sentences = new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
        for (String sentence : sentences) {
            if (sentence.length() > 100) {
                continue;
            }
            NerSentence result = ner.findNamedEntities(sentence);
            List<NamedEntity> nes = result.getNamedEntities();
            int neCount = nes.size();
            // Check the cheap count condition first so that sentences which would be dropped
            // anyway do not trigger morphological analysis of their entity tokens.
            if (neCount == 0 || neCount >= 3) {
                continue;
            }
            if (containsBadNamedEntity(nes, illegal, morphology)) {
                continue;
            }
            lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
            if (lines.size() == 1000) {
                Path out = outRoot.resolve(chunk.id + "-" + k);
                Files.write(out, lines);
                Log.info("%s created. ", out);
                lines = new ArrayList<>();
                k++;
                if (k > 10) {
                    System.exit(0);
                }
            }
        }
    }
}

/**
 * Tells whether any of the given named entities should be rejected.
 *
 * <p>An entity is rejected when one of its tokens is contained in {@code illegal}, or when any
 * morphological analysis of one of its tokens has a dictionary item whose secondary POS is
 * neither {@code Abbreviation} nor {@code ProperNoun}. The scan stops at the first rejection.
 *
 * @param entities named entities of a single sentence
 * @param illegal token strings that must not appear inside a named entity
 * @param morphology analyzer used to inspect every entity token
 * @return {@code true} if at least one entity is rejected, {@code false} otherwise
 */
private static boolean containsBadNamedEntity(List<NamedEntity> entities, Set<String> illegal, TurkishMorphology morphology) {
    for (NamedEntity ne : entities) {
        for (NerToken token : ne.tokens) {
            if (illegal.contains(token.word)) {
                return true;
            }
            WordAnalysis a = morphology.analyze(token.word);
            for (SingleAnalysis analysis : a) {
                DictionaryItem item = analysis.getDictionaryItem();
                if (item.secondaryPos != SecondaryPos.Abbreviation && item.secondaryPos != SecondaryPos.ProperNoun) {
                    return true;
                }
            }
        }
    }
    return false;
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) BlockTextLoader(zemberek.core.text.BlockTextLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)

Example 17 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The generateAnnotationFileSingleSplit method of the class HunspellOperations.

/**
 * Builds a stem/ending annotation file for the words of a vocabulary file.
 *
 * <p>Each line of {@code vocab} is treated as one word and analyzed morphologically. Words whose
 * analysis is not correct are logged and skipped. For every remaining word, each analysis that
 * is neither a proper noun nor an abbreviation contributes one entry per stem: either
 * {@code "word stem ending"} or, when the stem covers the whole word, {@code "word stem"}.
 * The distinct entries of a word are joined with commas into one output line, written to
 * {@code data/vocabulary/annonations.txt}.
 *
 * @param vocab path of the UTF-8 vocabulary file, one word per line
 * @throws IOException if the vocabulary cannot be read or the annotation file cannot be written
 */
private static void generateAnnotationFileSingleSplit(Path vocab) throws IOException {
    List<String> vocabulary = Files.readAllLines(vocab, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> outputLines = new ArrayList<>();
    for (String word : vocabulary) {
        WordAnalysis wordAnalysis = morphology.analyze(word);
        if (!wordAnalysis.isCorrect()) {
            Log.warn("Cannot analyze %s", word);
            continue;
        }
        // Insertion-ordered set: identical splits coming from different analyses appear once.
        LinkedHashSet<String> splits = new LinkedHashSet<>();
        for (SingleAnalysis single : wordAnalysis) {
            // Only analyses that are neither proper nouns nor abbreviations are annotated.
            if (single.getDictionaryItem().secondaryPos != SecondaryPos.ProperNoun
                    && single.getDictionaryItem().secondaryPos != SecondaryPos.Abbreviation) {
                for (String stem : single.getStems()) {
                    String ending = word.substring(stem.length());
                    String rebuilt = stem + ending;
                    if (rebuilt.equals(word)) {
                        String entry = word + " " + stem;
                        if (ending.length() > 0) {
                            entry = entry + " " + ending;
                        }
                        splits.add(entry);
                    } else {
                        Log.warn("Stem + Ending %s+%s does not match word %s", stem, ending, word);
                    }
                }
            }
        }
        outputLines.add(String.join(",", splits));
    }
    Path target = Paths.get("data/vocabulary/annonations.txt");
    Files.write(target, outputLines, StandardCharsets.UTF_8);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 18 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The extract method of the class DataConverter.

/**
 * Converts an annotated data set into sentence analyses and saves the fully matched sentences.
 *
 * <p>For every sentence whose space-separated token count equals its number of reference
 * parses, each reference parse is normalized with {@code normalizeParse} and compared against
 * the {@code convert}-ed analyses of the token. When no analysis matches and the token starts
 * with an upper-case letter, a second attempt is made with {@code +noun} turned into
 * {@code +noun+prop}. Tokens that still have no match are counted in a failure histogram.
 * Only sentences in which every token was matched are passed to {@code Scripts.saveUnambiguous}.
 * Failures seen at least three times are written to {@code parse-fails.txt}.
 *
 * @param dataPath path of the UTF-8 input data set
 * @param output path handed to {@code Scripts.saveUnambiguous} for the matched sentences
 * @throws IOException if reading the data set or writing any output fails
 */
private static void extract(Path dataPath, Path output) throws IOException {
    DataSet set = com.google.common.io.Files.asCharSource(dataPath.toFile(), Charsets.UTF_8).readLines(new DataSetLoader());
    TurkishMorphology morphology = TurkishMorphology.create(RootLexicon.builder().addTextDictionaryResources("tr/master-dictionary.dict", "tr/non-tdk.dict", "tr/proper.dict", "tr/proper-from-corpus.dict", "tr/abbreviations.dict", "tr/person-names.dict").build());
    List<SentenceAnalysis> result = new ArrayList<>();
    Histogram<String> parseFails = new Histogram<>();
    for (SentenceData sentenceData : set) {
        List<String> tokens = Splitter.on(" ").splitToList(sentenceData.sentence());
        if (tokens.size() == 0 || tokens.size() != sentenceData.correctParse.size()) {
            continue;
        }
        List<SentenceWordAnalysis> correctList = new ArrayList<>();
        for (int i = 0; i < tokens.size(); i++) {
            String s = tokens.get(i);
            String p = normalizeParse(sentenceData.correctParse.get(i));
            WordAnalysis a = morphology.analyze(s);
            if (!a.isCorrect()) {
                // One unanalyzable token means the sentence can no longer match completely.
                break;
            }
            SingleAnalysis best = findMatchingAnalysis(a, p);
            if (best == null && Character.isUpperCase(s.charAt(0)) && p.contains("+noun") && !p.contains("prop")) {
                // Capitalized token annotated as a plain noun: retry as a proper noun.
                String pp = p.replaceFirst("\\+noun", "\\+noun+prop");
                best = findMatchingAnalysis(a, pp);
            }
            if (best == null) {
                parseFails.add(s + " " + p);
            } else {
                correctList.add(new SentenceWordAnalysis(best, a));
            }
        }
        if (correctList.size() == tokens.size()) {
            result.add(new SentenceAnalysis(sentenceData.sentence(), correctList));
        }
    }
    Scripts.saveUnambiguous(result, output);
    parseFails.removeSmaller(3);
    parseFails.saveSortedByCounts(Paths.get("parse-fails.txt"), " ");
    System.out.format("Full Sentence Match  = %d in %d%n", result.size(), set.sentences.size());
}

/**
 * Rewrites a reference parse from the data set into the form it is compared against.
 *
 * <p>The rewrite steps are applied in order; after the lower-casing step every pattern must be
 * lower case to have any effect. The result is finally replaced by its {@code lookup} entry
 * when one exists.
 *
 * @param parse raw reference parse of a single token
 * @return the normalized parse string
 */
private static String normalizeParse(String parse) {
    String p = parse;
    p = p.replaceAll("PCNom", "PCNOM");
    p = p.replaceAll("Pnon|Nom", "");
    p = p.replaceAll("\\+Pos\\+", "+");
    p = p.replaceAll("\\+Pos\\^DB", "^DB");
    p = p.replaceAll("[+]+", "+");
    p = p.replaceAll("[+]$", "");
    p = p.replaceAll("[+]\\^DB", "^DB");
    p = p.replaceAll("[.]", "");
    p = p.toLowerCase(Turkish.LOCALE);
    p = p.replaceAll("adverb", "adv");
    p = p.replaceAll("\\+cop\\+a3sg", "+a3sg+cop");
    // The text is already lower case here, so the pattern and its replacement must be lower
    // case as well; the former "\\+Unable" pattern could never match.
    p = p.replaceAll("\\+unable", "^db+verb+able+neg");
    if (lookup.containsKey(p)) {
        p = lookup.get(p);
    }
    return p;
}

/**
 * Finds the first analysis whose {@code convert}-ed form equals the expected parse.
 *
 * @param analyses all analyses of one token
 * @param expected normalized parse to look for
 * @return the first matching analysis, or {@code null} when none matches
 */
private static SingleAnalysis findMatchingAnalysis(WordAnalysis analyses, String expected) {
    for (SingleAnalysis analysis : analyses) {
        if (convert(analysis).equals(expected)) {
            return analysis;
        }
    }
    return null;
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) DataSet(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSet) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) SentenceData(zemberek.morphology.old_ambiguity.AbstractDisambiguator.SentenceData) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) DataSetLoader(zemberek.morphology.old_ambiguity.AbstractDisambiguator.DataSetLoader) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 19 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The generateHistograms method of the class WordHistogram.

/**
 * Counts root and word frequencies over the given paragraphs and saves them as histogram files.
 *
 * <p>Every paragraph is split into sentences and each sentence is analyzed and disambiguated.
 * A word is skipped when its best analysis is a numeral or punctuation, is unknown, is a
 * runtime analysis whose input contains a digit, or has no lemmas. For the remaining words the
 * dictionary lemma is added to the root histogram, and the surface form is added to the word
 * histogram: capitalized for proper nouns, lower-cased otherwise. Progress is printed every
 * 1000 paragraphs, followed by the total token and sentence counts.
 *
 * <p>Six files are written under {@code outRoot}: roots and words sorted by count and by key,
 * and the same two word files again after removing words seen fewer than 10 times.
 *
 * @param paragraphs paragraphs to process
 * @param outRoot directory receiving the histogram files; created if missing
 * @throws IOException if the directory or one of the files cannot be written
 */
static void generateHistograms(List<String> paragraphs, Path outRoot) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Histogram<String> rootCounts = new Histogram<>(1000_000);
    Histogram<String> wordCounts = new Histogram<>(1000_000);
    int processedParagraphs = 0;
    int totalSentences = 0;
    int totalTokens = 0;
    for (String paragraph : paragraphs) {
        List<String> sentences = TurkishSentenceExtractor.DEFAULT.fromParagraph(paragraph);
        totalSentences += sentences.size();
        for (String sentence : sentences) {
            // The tokenizer is used only for the token statistics printed at the end.
            totalTokens += TurkishTokenizer.DEFAULT.tokenize(sentence).size();
            for (SentenceWordAnalysis entry : morphology.analyzeAndDisambiguate(sentence)) {
                SingleAnalysis best = entry.getBestAnalysis();
                boolean numeralOrPunctuation = best.getPos() == PrimaryPos.Numeral || best.getPos() == PrimaryPos.Punctuation;
                if (numeralOrPunctuation || best.isUnknown()) {
                    continue;
                }
                // Runtime analyses of inputs that contain a digit are left out.
                if (best.isRuntime() && !Strings.containsNone(entry.getWordAnalysis().getInput(), "01234567890")) {
                    continue;
                }
                if (best.getLemmas().isEmpty()) {
                    continue;
                }
                rootCounts.add(best.getDictionaryItem().lemma);
                String surface = entry.getWordAnalysis().getInput();
                boolean properNoun = best.getDictionaryItem().secondaryPos == SecondaryPos.ProperNoun;
                wordCounts.add(properNoun ? Turkish.capitalize(surface) : surface.toLowerCase(Turkish.LOCALE));
            }
        }
        processedParagraphs++;
        if (processedParagraphs % 1000 == 0) {
            System.out.println(processedParagraphs + " of " + paragraphs.size());
        }
    }
    System.out.println("tokenCounter = " + totalTokens);
    System.out.println("sentenceCounter = " + totalSentences);
    Files.createDirectories(outRoot);
    rootCounts.saveSortedByCounts(outRoot.resolve("roots.freq.txt"), " ");
    rootCounts.saveSortedByKeys(outRoot.resolve("roots.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    wordCounts.saveSortedByCounts(outRoot.resolve("words.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
    // The "words10" files hold only words that occur at least 10 times.
    wordCounts.removeSmaller(10);
    wordCounts.saveSortedByCounts(outRoot.resolve("words10.freq.txt"), " ");
    wordCounts.saveSortedByKeys(outRoot.resolve("words10.keys.txt"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Example 20 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

The saveUnambiguous method of the class Scripts.

/**
 * Writes the analyses of the given sentences to a UTF-8 text file.
 *
 * <p>Each sentence is analyzed and disambiguated; sentences whose best analyses include an
 * unknown one are skipped. For every remaining sentence the output holds an {@code S:} line
 * with the sentence, then for each word its input followed by the long format of every
 * analysis. When a word has more than one analysis, the one equal to the best analysis is
 * suffixed with {@code *}. An empty line ends each sentence.
 *
 * @param sentences sentences to analyze
 * @param morphology analyzer used for analysis and disambiguation
 * @param out file to write
 * @throws IOException if the output file cannot be opened
 */
public static void saveUnambiguous(List<String> sentences, TurkishMorphology morphology, Path out) throws IOException {
    try (PrintWriter writer = new PrintWriter(out.toFile(), "utf-8")) {
        for (String sentence : sentences) {
            SentenceAnalysis sentenceAnalysis = morphology.analyzeAndDisambiguate(sentence);
            boolean hasUnknown = sentenceAnalysis.bestAnalysis().stream().anyMatch(candidate -> candidate.isUnknown());
            if (hasUnknown) {
                continue;
            }
            writer.format("S:%s%n", sentence);
            for (SentenceWordAnalysis wordEntry : sentenceAnalysis) {
                WordAnalysis wordAnalysis = wordEntry.getWordAnalysis();
                writer.println(wordAnalysis.getInput());
                SingleAnalysis best = wordEntry.getBestAnalysis();
                // A word with a single analysis needs no marker for its best analysis.
                boolean onlyOne = wordAnalysis.analysisCount() == 1;
                for (SingleAnalysis candidate : wordAnalysis) {
                    String marker = candidate.equals(best) ? "*" : "";
                    if (onlyOne) {
                        writer.println(candidate.formatLong());
                    } else {
                        writer.format("%s%s%n", candidate.formatLong(), marker);
                    }
                }
            }
            writer.println();
        }
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) PrintWriter(java.io.PrintWriter) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)55 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)38 ArrayList (java.util.ArrayList)25 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)23 TurkishMorphology (zemberek.morphology.TurkishMorphology)21 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)18 Test (org.junit.Test)15 LinkedHashSet (java.util.LinkedHashSet)13 PrintWriter (java.io.PrintWriter)10 Path (java.nio.file.Path)10 Histogram (zemberek.core.collections.Histogram)10 Token (zemberek.tokenization.Token)7 IOException (java.io.IOException)6 Ignore (org.junit.Ignore)6 Log (zemberek.core.logging.Log)6 HashSet (java.util.HashSet)5 List (java.util.List)5 Collectors (java.util.stream.Collectors)5 Paths (java.nio.file.Paths)4 Files (java.nio.file.Files)3