Search in sources :

Example 66 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method parseLargeVocabularyZemberekForMorfessor.

/**
 * Writes a Morfessor-style annotation file built from a word-frequency vocabulary.
 *
 * <p>Loads the histogram stored in {@code vocab.all.freq} under {@code DATA_PATH}, prunes it
 * with {@code removeSmaller(1000)}, and analyzes every remaining word whose trimmed length is
 * at least 4. For each word whose first analysis does not have an {@code Unknown} primary POS,
 * the distinct "root suffix suffix ..." segmentations are collected (analyses whose lemma is a
 * single character are ignored) and emitted as {@code word seg1, seg2, ...}. The accepted lines
 * are passed to {@code sortAndSave} with the target {@code out/morfessor-annotation.txt}.
 *
 * <p>This is a one-off script, hence the {@code @Ignore}.
 *
 * @throws IOException declared for the file-system and resource-loading calls made here
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberekForMorfessor() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("vocab.all.freq");
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    Log.info("Loading histogram.");
    Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
    histogram.removeSmaller(1000);
    List<String> accepted = new ArrayList<>(histogram.size());
    // Counts only the words that were actually analyzed; words skipped for length are not counted.
    int c = 0;
    for (String s : histogram) {
        s = s.trim();
        if (s.length() < 4) {
            continue;
        }
        List<WordAnalysis> parses = parser.analyze(s);
        if (!parses.isEmpty() && parses.get(0).dictionaryItem.primaryPos != PrimaryPos.Unknown) {
            // Insertion-ordered set: keeps the analyzer's ordering while dropping duplicates.
            LinkedHashSet<String> k = new LinkedHashSet<>(2);
            for (WordAnalysis parse : parses) {
                if (parse.dictionaryItem.lemma.length() > 1) {
                    // Normalize the WHOLE segmentation. Without the parentheses the
                    // replaceAll/trim chain only applied to the joined suffixes, so an
                    // analysis without any non-empty suffix surface produced "root " with a
                    // trailing space that leaked into the output and defeated de-duplication.
                    String str = (parse.root + " " + String.join(" ", parse.suffixSurfaceList()))
                        .replaceAll("[ ]+", " ")
                        .trim();
                    k.add(str);
                }
            }
            String join = String.join(", ", k).trim();
            // Skip words whose only segmentation is the word itself, and near-empty results.
            if (!s.equals(join) && join.length() > 2) {
                accepted.add(s + " " + join);
            }
        }
        if (c > 0 && c % 10000 == 0) {
            Log.info("Processed = " + c);
        }
        c++;
    }
    sortAndSave(outDir.resolve("morfessor-annotation.txt"), accepted);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 67 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method generatorTest.

/**
 * Scratch method: analyzes the word "besiciliği" with a default {@code TurkishMorphology}
 * instance and logs the inflectional groups of the first analysis returned.
 *
 * @throws IOException declared for the morphology set-up call
 */
@Test
@Ignore("Not a Test.")
public void generatorTest() throws IOException {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // Only the first analysis is inspected; the remaining ones are ignored.
    WordAnalysis firstAnalysis = morphology.analyze("besiciliği").get(0);
    Log.info(firstAnalysis.inflectionalGroups);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 68 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class AutomaticLabelingExperiment method processContent.

/**
 * Reduces {@code content} to a space-separated, lower-cased token string.
 *
 * <p>The text is tokenized with this object's {@code lexer}; tokens typed as percent numeral,
 * number, punctuation, roman numeral, time, unknown word or unknown are discarded. When
 * {@code useRoots} is set, the remaining text is analyzed and disambiguated with
 * {@code analyzer} and every word is replaced by the last lemma of its first parse. Words whose
 * first parse is unknown are kept as their input form, and words without any lemma are dropped.
 * Finally all apostrophes are removed and the text is lower-cased with {@code Turkish.LOCALE}.
 *
 * @param analyzer sentence analyzer, consulted only when {@code useRoots} is {@code true}
 * @param content  raw text to process
 * @param useRoots whether words should be replaced by lemmas
 * @return the normalized text
 */
public String processContent(TurkishSentenceAnalyzer analyzer, String content, boolean useRoots) {
    List<Token> tokens = lexer.tokenize(content);
    List<String> kept = new ArrayList<>(tokens.size());
    for (Token current : tokens) {
        int type = current.getType();
        boolean dropped = type == TurkishLexer.PercentNumeral
            || type == TurkishLexer.Number
            || type == TurkishLexer.Punctuation
            || type == TurkishLexer.RomanNumeral
            || type == TurkishLexer.Time
            || type == TurkishLexer.UnknownWord
            || type == TurkishLexer.Unknown;
        if (!dropped) {
            kept.add(current.getText());
        }
    }

    String text = String.join(" ", kept);
    if (useRoots) {
        SentenceAnalysis sentence = analyzer.analyze(text);
        analyzer.disambiguate(sentence);
        List<String> roots = new ArrayList<>();
        for (SentenceAnalysis.Entry entry : sentence) {
            // Only the first parse of each entry is consulted.
            WordAnalysis top = entry.parses.get(0);
            if (top.isUnknown()) {
                // No usable analysis: fall back to the surface form.
                roots.add(entry.input);
            } else {
                List<String> lemmas = top.getLemmas();
                if (!lemmas.isEmpty()) {
                    roots.add(lemmas.get(lemmas.size() - 1));
                }
            }
        }
        text = String.join(" ", roots);
    }
    return text.replaceAll("[']", "").toLowerCase(Turkish.LOCALE);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Token(org.antlr.v4.runtime.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis)

Example 69 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project lucene-solr-analysis-turkish by iorixxx.

the class Zemberek3StemFilterFactory method parse.

/**
 * Debug helper: prints every analysis of {@code word} to standard output, followed by the stem
 * returned by {@code Zemberek3StemFilter.stem} for the {@code "maxLength"} strategy.
 *
 * <p>When the word has no analyses, only the summary line is printed.
 *
 * @param word       word to analyze
 * @param morphology analyzer used to produce the analyses
 */
private static void parse(String word, TurkishMorphology morphology) {
    final List<WordAnalysis> analyses = morphology.analyze(word);
    final int solutionCount = analyses.size();
    System.out.println("Word = " + word + " has " + solutionCount + " many solutions");
    if (solutionCount == 0) {
        return;
    }

    System.out.println("Parses: ");
    for (WordAnalysis analysis : analyses) {
        System.out.println("number of morphemes = " + analysis.inflectionalGroups.size());
        System.out.println(analysis.formatLong());
        System.out.println("\tStems = " + analysis.getStems());
        System.out.println("\tLemmas = " + analysis.getLemmas());
        System.out.println("\tLemma = " + analysis.getLemma());
        // Two "Root" lines on purpose: the analysis' own root, then the dictionary item's root.
        System.out.println("\tRoot = " + analysis.getRoot());
        System.out.println("\tRoot = " + analysis.dictionaryItem.root);
        System.out.println("\tStemAndEnding = " + analysis.getStemAndEnding());
        System.out.println("-------------------");
    }

    System.out.println("final selected stem : " + Zemberek3StemFilter.stem(analyses, "maxLength"));
    System.out.println("==================================");
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 70 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class MorphologyConsole method run.

/**
 * Runs an interactive console that analyzes and disambiguates each line read from standard
 * input and prints every analysis of every word. When a word has more than one analysis, the
 * one selected as best is marked with a trailing {@code *}.
 *
 * <p>The loop ends when the user enters {@code quit} or when standard input reaches its end
 * (for example piped input or Ctrl+D). Blank lines are reported and skipped.
 */
@Override
public void run() {
    Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
    if (disableUnknownAnalysis) {
        b.disableUnidentifiedTokenAnalyzer();
    }
    if (enableInformalWordAnalysis) {
        b.useInformalAnalysis();
    }
    TurkishMorphology morphology = b.build();
    System.out.println("Enter word or sentence. Type `quit` or `Ctrl+C` to exit.:");
    // The scanner wraps System.in and is intentionally left open so the stream stays usable.
    Scanner sc = new Scanner(System.in);
    // hasNextLine() guards against end-of-input: calling nextLine() unconditionally throws
    // NoSuchElementException once stdin is exhausted.
    while (sc.hasNextLine()) {
        String input = sc.nextLine();
        if (input.equals("quit")) {
            break;
        }
        if (input.trim().isEmpty()) {
            System.out.println("Empty line cannot be processed.");
            continue;
        }
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
        System.out.format("%nS:%s%n", input);
        for (SentenceWordAnalysis sw : analysis) {
            WordAnalysis wa = sw.getWordAnalysis();
            System.out.println(wa.getInput());
            SingleAnalysis best = sw.getBestAnalysis();
            for (SingleAnalysis singleAnalysis : wa) {
                boolean isBest = singleAnalysis.equals(best);
                if (wa.analysisCount() == 1) {
                    // A single analysis needs no "best" marker.
                    System.out.println(singleAnalysis.formatLong());
                } else {
                    System.out.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
                }
            }
        }
        System.out.println();
    }
}
Also used : Scanner(java.util.Scanner) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Builder(zemberek.morphology.TurkishMorphology.Builder) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 96, Test (org.junit.Test): 42, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 36, TurkishMorphology (zemberek.morphology.TurkishMorphology): 22, ArrayList (java.util.ArrayList): 21, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 19, LinkedHashSet (java.util.LinkedHashSet): 13, Ignore (org.junit.Ignore): 13, Histogram (zemberek.core.collections.Histogram): 12, Path (java.nio.file.Path): 11, PrintWriter (java.io.PrintWriter): 10, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 10, IOException (java.io.IOException): 6, HashSet (java.util.HashSet): 6, List (java.util.List): 6, WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer): 6, SimpleGenerator (zemberek.morphology.generator.SimpleGenerator): 6, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 6, DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph): 6, Log (zemberek.core.logging.Log): 5