Example 16 with TurkishMorphology

Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class ZemberekNlpScripts, method disambiguationExample.

@Test
@Ignore("Not a Test")
public void disambiguationExample() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    String sentence = "Yarın kar yağacak.";
    System.out.println("Sentence  = " + sentence);
    List<WordAnalysis> analysis = morphology.analyzeSentence(sentence);
    System.out.println("Before disambiguation.");
    for (WordAnalysis entry : analysis) {
        System.out.println("Word = " + entry.getInput());
        for (SingleAnalysis single : entry) {
            System.out.println(single.formatLong());
        }
    }
    System.out.println("\nAfter disambiguation.");
    // Pick the most likely analysis of each word given the sentence context.
    SentenceAnalysis after = morphology.disambiguate(sentence, analysis);
    after.bestAnalysis().forEach(s -> System.out.println(s.formatLong()));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)
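
A note on the API shown here: the two-step flow (analyzeSentence followed by disambiguate) can be collapsed into a single call. A minimal sketch, not taken from the repository, using only calls that appear elsewhere on this page; the class name is illustrative:

import java.io.IOException;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SentenceAnalysis;
import zemberek.morphology.analysis.SingleAnalysis;

public class DisambiguationSketch {

    public static void main(String[] args) throws IOException {
        // Load the default lexicon and analyzers.
        TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
        // "Yarın kar yağacak." = "It will snow tomorrow."
        // analyzeAndDisambiguate runs analysis and disambiguation together,
        // as CategoryPredictionExperiment does further below.
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate("Yarın kar yağacak.");
        // Print the single best analysis chosen for each word.
        for (SingleAnalysis best : analysis.bestAnalysis()) {
            System.out.println(best.formatLong());
        }
    }
}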

Example 17 with TurkishMorphology

Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class ZemberekNlpScripts, method failedWordTestIssue124.

@Test
@Ignore("Not a Test.")
public void failedWordTestIssue124() throws IOException {
    Path failPath = DATA_PATH.resolve("fails.txt");
    LinkedHashSet<String> words = new LinkedHashSet<>(Files.readAllLines(failPath, StandardCharsets.UTF_8));
    LinkedHashSet<String> accepted = new LinkedHashSet<>();
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    for (String s : words) {
        WordAnalysis parses = parser.analyze(s);
        List<SingleAnalysis> analyses = parses.getAnalysisResults();
        for (SingleAnalysis parse : analyses) {
            if (parse.isUnknown() || parse.isRuntime()) {
                continue;
            }
            accepted.add(s);
        }
    }
    for (String s : accepted) {
        words.remove(s);
    }
    Path failReduced = DATA_PATH.resolve("fails-reduced.txt");
    try (PrintWriter pw = new PrintWriter(failReduced.toFile(), "utf-8")) {
        words.forEach(pw::println);
    }
    Log.info("Word count = %d Found = %d Not Found = %d", words.size(), accepted.size(), words.size() - accepted.size());
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)
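
The acceptance check in the loop above (keep a word only if some analysis is neither unknown nor runtime-generated) can be distilled into a small helper. A minimal sketch, not from the repository; hasLexiconAnalysis is a hypothetical helper name:

import java.io.IOException;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SingleAnalysis;
import zemberek.morphology.analysis.WordAnalysis;

public class DictionaryCheckSketch {

    // True if the word has at least one analysis backed by the lexicon,
    // i.e. neither an unknown-word nor a runtime-generated analysis.
    static boolean hasLexiconAnalysis(TurkishMorphology morphology, String word) {
        WordAnalysis analyses = morphology.analyze(word);
        for (SingleAnalysis analysis : analyses) {
            if (!analysis.isUnknown() && !analysis.isRuntime()) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) throws IOException {
        TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
        System.out.println(hasLexiconAnalysis(morphology, "kitaplar")); // "books", expected true
        System.out.println(hasLexiconAnalysis(morphology, "qwerty"));   // likely false
    }
}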

Example 18 with TurkishMorphology

Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class CategoryPredictionExperiment, method generateSets.

private void generateSets(Path input, Path train, Path test, boolean useOnlyTitle, boolean useLemmas) throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WebCorpus corpus = new WebCorpus("category", "category");
    Log.info("Loading corpus from %s", input);
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    Histogram<String> categoryCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        String category = document.getCategory();
        if (category.length() > 0) {
            categoryCounts.add(category);
        }
    }
    Log.info("All category count = %d", categoryCounts.size());
    // Ignore rare categories (fewer than 20 documents).
    categoryCounts.removeSmaller(20);
    for (String c : categoryCounts.getSortedList()) {
        System.out.println(c + " " + categoryCounts.getCount(c));
    }
    Log.info("Reduced label count = %d", categoryCounts.size());
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    for (WebDocument document : corpus.getDocuments()) {
        if (document.getCategory().length() == 0) {
            continue;
        }
        if (useOnlyTitle && document.getTitle().length() == 0) {
            continue;
        }
        String content = document.getContentAsString();
        String title = document.getTitle();
        List<Token> docTokens = useOnlyTitle ? lexer.tokenize(title) : lexer.tokenize(content);
        List<String> reduced = new ArrayList<>(docTokens.size());
        String category = document.getCategory();
        if (categoryCounts.contains(category)) {
            category = "__label__" + document.getCategory().replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE);
        } else {
            continue;
        }
        for (Token token : docTokens) {
            // Skip numerals, punctuation, time expressions and unknown tokens.
            if (token.getType() == Token.Type.PercentNumeral
                    || token.getType() == Token.Type.Number
                    || token.getType() == Token.Type.Punctuation
                    || token.getType() == Token.Type.RomanNumeral
                    || token.getType() == Token.Type.Time
                    || token.getType() == Token.Type.UnknownWord
                    || token.getType() == Token.Type.Unknown) {
                continue;
            }
            String tokenStr = token.getText();
            reduced.add(tokenStr);
        }
        String join = String.join(" ", reduced);
        if (join.trim().isEmpty()) {
            continue;
        }
        // Optionally replace each word with the last lemma of its best analysis.
        if (useLemmas) {
            SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(join);
            List<String> res = new ArrayList<>();
            for (SentenceWordAnalysis e : analysis) {
                SingleAnalysis best = e.getBestAnalysis();
                if (best.isUnknown()) {
                    res.add(e.getWordAnalysis().getInput());
                    continue;
                }
                List<String> lemmas = best.getLemmas();
                if (lemmas.size() == 0) {
                    continue;
                }
                res.add(lemmas.get(lemmas.size() - 1));
            }
            join = String.join(" ", res);
        }
        set.add("#" + document.getId() + " " + category + " " + join.replaceAll("[']", "").toLowerCase(Turkish.LOCALE));
        if (c++ % 1000 == 0) {
            Log.info("%d of %d processed.", c, corpus.documentCount());
        }
    }
    Log.info("Generate train and test set.");
    saveSets(train, test, new LinkedHashSet<>(set));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WebDocument(zemberek.corpus.WebDocument) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) WebCorpus(zemberek.corpus.WebCorpus)
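
The useLemmas branch of generateSets can be read in isolation: each word is replaced with the last lemma of its best analysis, and unknown words keep their surface form. A minimal sketch, not from the repository; lemmatize is a hypothetical helper name:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SentenceAnalysis;
import zemberek.morphology.analysis.SentenceWordAnalysis;
import zemberek.morphology.analysis.SingleAnalysis;

public class LemmaExtractionSketch {

    static List<String> lemmatize(TurkishMorphology morphology, String sentence) {
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(sentence);
        List<String> lemmas = new ArrayList<>();
        for (SentenceWordAnalysis wordAnalysis : analysis) {
            SingleAnalysis best = wordAnalysis.getBestAnalysis();
            if (best.isUnknown()) {
                // No analysis found; keep the surface form.
                lemmas.add(wordAnalysis.getWordAnalysis().getInput());
                continue;
            }
            List<String> itemLemmas = best.getLemmas();
            if (!itemLemmas.isEmpty()) {
                // Take the last (most derived) lemma, matching generateSets above.
                lemmas.add(itemLemmas.get(itemLemmas.size() - 1));
            }
        }
        return lemmas;
    }

    public static void main(String[] args) throws IOException {
        TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
        // "Kitapları masaya bıraktım." = "I left the books on the table."
        System.out.println(lemmatize(morphology, "Kitapları masaya bıraktım."));
    }
}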

Example 19 with TurkishMorphology

Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class ZemberekNlpScripts, method readmeExample1.

@Test
@Ignore("Not a Test")
public void readmeExample1() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    WordAnalysis results = morphology.analyze("kalemin");
    results.forEach(s -> System.out.println(s.formatLong()));
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)
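
A slightly expanded sketch of the same README example, not from the repository, printing the dictionary item and lemmas of each analysis alongside the long format:

import java.io.IOException;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.analysis.SingleAnalysis;
import zemberek.morphology.analysis.WordAnalysis;

public class AnalyzeWordSketch {

    public static void main(String[] args) throws IOException {
        TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
        // "kalemin" is ambiguous, e.g. "your pen", "of the pen", "of my fortress";
        // each SingleAnalysis is one reading.
        WordAnalysis results = morphology.analyze("kalemin");
        for (SingleAnalysis analysis : results) {
            System.out.println("Dictionary item = " + analysis.getDictionaryItem());
            System.out.println("Lemmas          = " + analysis.getLemmas());
            System.out.println(analysis.formatLong());
        }
    }
}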

Example 20 with TurkishMorphology

Use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

The class CorpusNerCollector, method main.

public static void main(String[] args) throws IOException {
    Path corporaRoot = Paths.get("/media/ahmetaa/depo/corpora");
    Path corpusDirList = corporaRoot.resolve("ner-list");
    Path outRoot = Paths.get("/media/ahmetaa/depo/ner/out");
    Files.createDirectories(outRoot);
    BlockTextLoader corpusProvider = BlockTextLoader.fromDirectoryRoot(corporaRoot, corpusDirList, 10_000);
    // assumes you generated a model in my-model directory.
    Path modelRoot = Paths.get("my-model");
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
    // Tokens that are pure punctuation must not appear inside a named entity.
    Set<String> illegal = Sets.newHashSet(".", ",", "!", "?", ":");
    List<String> lines = new ArrayList<>();
    int c = 0;
    int k = 0;
    for (TextChunk chunk : corpusProvider) {
        LinkedHashSet<String> sentences = new LinkedHashSet<>(TextCleaner.cleanAndExtractSentences(chunk.getData()));
        for (String sentence : sentences) {
            if (sentence.length() > 100) {
                continue;
            }
            NerSentence result = ner.findNamedEntities(sentence);
            List<NamedEntity> nes = result.getNamedEntities();
            int neCount = nes.size();
            boolean badNamedEntity = false;
            for (NamedEntity ne : nes) {
                for (NerToken token : ne.tokens) {
                    if (illegal.contains(token.word)) {
                        badNamedEntity = true;
                        break;
                    }
                    // Every analysis of every entity token must be a proper noun or an
                    // abbreviation; otherwise the entity is discarded.
                    WordAnalysis a = morphology.analyze(token.word);
                    for (SingleAnalysis analysis : a) {
                        DictionaryItem item = analysis.getDictionaryItem();
                        if (item.secondaryPos != SecondaryPos.Abbreviation && item.secondaryPos != SecondaryPos.ProperNoun) {
                            badNamedEntity = true;
                            break;
                        }
                    }
                }
                if (badNamedEntity) {
                    break;
                }
            }
            if (badNamedEntity) {
                continue;
            }
            if (neCount > 0 && neCount < 3) {
                lines.add(result.getAsTrainingSentence(AnnotationStyle.BRACKET));
                c++;
                if (c == 1000) {
                    Path out = outRoot.resolve(chunk.id + "-" + k);
                    Files.write(out, lines);
                    Log.info("%s created. ", out);
                    lines = new ArrayList<>();
                    c = 0;
                    k++;
                    if (k > 10) {
                        System.exit(0);
                    }
                }
            }
        }
    }
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) BlockTextLoader(zemberek.core.text.BlockTextLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem)
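
Stripped of the corpus scanning and filtering, the NER call path used above is short. A minimal sketch, not from the repository, assuming the NER classes live in the zemberek.ner package and that a model was trained beforehand into the (hypothetical) my-model directory:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import zemberek.morphology.TurkishMorphology;
import zemberek.morphology.lexicon.RootLexicon;
import zemberek.ner.NamedEntity;
import zemberek.ner.NerSentence;
import zemberek.ner.PerceptronNer;

public class NerSketch {

    public static void main(String[] args) throws IOException {
        Path modelRoot = Paths.get("my-model");
        TurkishMorphology morphology = TurkishMorphology.builder()
            .setLexicon(RootLexicon.getDefault())
            .build();
        PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology);
        // "Ali Ankara'ya gitti." = "Ali went to Ankara."
        NerSentence result = ner.findNamedEntities("Ali Ankara'ya gitti.");
        for (NamedEntity entity : result.getNamedEntities()) {
            System.out.println(entity);
        }
    }
}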

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87
Test (org.junit.Test): 38
Path (java.nio.file.Path): 34
ArrayList (java.util.ArrayList): 23
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23
WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23
Ignore (org.junit.Ignore): 21
DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15
LinkedHashSet (java.util.LinkedHashSet): 13
PrintWriter (java.io.PrintWriter): 10
SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10
Stopwatch (com.google.common.base.Stopwatch): 8
Histogram (zemberek.core.collections.Histogram): 8
Token (zemberek.tokenization.Token): 8
HashSet (java.util.HashSet): 7
SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7
TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7
ScoredItem (zemberek.core.ScoredItem): 6
IOException (java.io.IOException): 5
BlockTextLoader (zemberek.core.text.BlockTextLoader): 5