Search in sources :

Example 91 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishMorphologyFunctionalTests method testForeingLocale.

/**
 * Checks how the {@code [A:LocaleEn]} attribute on the lexicon entry "UNICEF" affects
 * analysis of the surface form "Unicefte": one analysis is expected with the attribute,
 * none without it.
 */
@Test
public void testForeingLocale() {
    // Entry declared with the [A:LocaleEn] attribute: exactly one analysis is expected.
    WordAnalysis withLocaleAttribute = getMorphology("UNICEF [A:LocaleEn]").analyze("Unicefte");
    Assert.assertEquals(1, withLocaleAttribute.analysisCount());

    // Same entry declared without the attribute: no analysis is expected.
    WordAnalysis withoutLocaleAttribute = getMorphology("UNICEF").analyze("Unicefte");
    Assert.assertEquals(0, withoutLocaleAttribute.analysisCount());
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Test(org.junit.Test)

Example 92 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method performance.

/**
 * Manual benchmark (ignored as a unit test) that times three phases over the lines of a
 * hard-coded corpus file: tokenization only, sentence word analysis, and analysis plus
 * disambiguation. Throughput figures are written through {@code Log.info}.
 *
 * <p>The token counts gathered in the first phase are reused as the numerator for the speed
 * figures of the two later phases.
 *
 * @throws IOException if the corpus file cannot be read
 */
@Test
@Ignore("Not a Test.")
public void performance() throws IOException {
    // The corpus location is machine specific; the commented-out path is an alternative location.
    List<String> lines = Files.readAllLines(// Paths.get("/media/depo/data/aaa/corpora/dunya.100k")
    Paths.get("/home/ahmetaa/data/nlp/corpora/dunya.100k"));
    // Analyzer built on the default lexicon, with the unidentified token analyzer and the cache disabled.
    TurkishMorphology analyzer = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().disableCache().build();
    Log.info(lines.size() + " lines will be processed.");
    Log.info("Dictionary has " + analyzer.getLexicon().size() + " items.");
    long tokenCount = 0;
    long tokenCountNoPunct = 0;
    // Phase 1: tokenization only. The two stream passes that count tokens are inside the timed region.
    Stopwatch clock = Stopwatch.createStarted();
    TurkishTokenizer lexer = TurkishTokenizer.DEFAULT;
    for (String line : lines) {
        List<Token> tokens = lexer.tokenize(line);
        // Every token except SpaceTab tokens.
        tokenCount += tokens.stream().filter(s -> (s.getType() != Token.Type.SpaceTab)).count();
        // Every token except SpaceTab and Punctuation tokens.
        tokenCountNoPunct += tokens.stream().filter(s -> (s.getType() != Token.Type.Punctuation && s.getType() != Token.Type.SpaceTab)).count();
    }
    long elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Token Count = " + tokenCount);
    Log.info("Token Count (No Punctuation) = " + tokenCountNoPunct);
    // Tokens per second; the division is done in double, so an elapsed time of 0 ms does not throw.
    Log.info("Tokenization Speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization Speed (No Punctuation) = %.1f tokens/sec ", tokenCountNoPunct * 1000d / elapsed);
    Log.info("");
    Log.info("Sentence word analysis test:");
    // Accumulates result sizes across phases 2 and 3 and is logged at the very end.
    int counter = 0;
    // Phase 2: analyzeSentence on every line; the stopwatch is restarted from zero.
    clock.reset().start();
    for (String line : lines) {
        try {
            List<WordAnalysis> res = analyzer.analyzeSentence(line);
            // for preventing VM optimizations.
            counter += res.size();
        } catch (Exception e) {
            // A failing line is logged and skipped so the benchmark continues with the next line.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Tokenization + Analysis speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    Log.info(analyzer.toString());
    Log.info("");
    Log.info("Disambiguation Test:");
    // NOTE(review): the analyzer was built with disableCache(); confirm whether this call still has any effect.
    analyzer.invalidateCache();
    // Phase 3: analyzeAndDisambiguate on every line; the stopwatch is restarted from zero.
    clock.reset().start();
    for (String line : lines) {
        try {
            SentenceAnalysis results = analyzer.analyzeAndDisambiguate(line);
            // for preventing VM optimizations.
            counter += results.size();
        } catch (Exception e) {
            // A failing line is logged and skipped so the benchmark continues with the next line.
            Log.info(line);
            e.printStackTrace();
        }
    }
    elapsed = clock.elapsed(TimeUnit.MILLISECONDS);
    Log.info("Elapsed Time = " + elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed = %.1f tokens/sec", tokenCount * 1000d / elapsed);
    Log.info("Tokenization + Analysis + Disambiguation speed (no punctuation) = %.1f tokens/sec", tokenCountNoPunct * 1000d / elapsed);
    Log.info(counter);
}
Also used : WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 93 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method parseLargeVocabularyZemberek.

/**
 * Manual script (ignored as a unit test) that runs every word of a frequency histogram
 * through the default morphology and collects the words whose first analysis has a
 * dictionary item with a primary POS other than {@code PrimaryPos.Unknown}.
 * The collected words are handed to {@code save} and {@code sortAndSave}.
 *
 * @throws IOException if the output directory cannot be created or the histogram cannot be loaded
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
    Path frequencyFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
    Path outputDir = DATA_PATH.resolve("out");
    Files.createDirectories(outputDir);

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Log.info("Loading histogram.");
    Histogram<String> wordCounts = Histogram.loadFromUtf8File(frequencyFile, ' ');

    List<String> parsedWords = new ArrayList<>(wordCounts.size() / 3);
    int processed = 0;
    for (String word : wordCounts) {
        try {
            List<SingleAnalysis> results = morphology.analyze(word).getAnalysisResults();
            // Only the first analysis is inspected.
            boolean firstAnalysisIsKnown = !results.isEmpty()
                    && results.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown;
            if (firstAnalysisIsKnown) {
                parsedWords.add(word);
            }
            // Progress report every 10000 words, before the current word is counted.
            if (processed > 0 && processed % 10000 == 0) {
                Log.info("Processed = " + processed);
            }
            // Words that raise an exception above are not counted.
            processed++;
        } catch (Exception e) {
            Log.info("Exception in %s", word);
        }
    }

    save(outputDir.resolve("zemberek-parsed-words.txt"), parsedWords);
    sortAndSave(outputDir.resolve("zemberek-parsed-words.tr.txt"), parsedWords);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 94 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method guessRootsWithHeuristics.

/**
 * Manual script (ignored as a unit test) that guesses root candidates for words read from
 * {@code out/no-parse-zemberek-freq.txt}.
 *
 * <p>For each word of at least four characters that contains a vowel, every prefix of length
 * three or more that contains a vowel is tried as a candidate root. Each candidate is
 * temporarily added to the morphology's stem transitions as a default entry, as a
 * {@code [P:Verb]} entry and, when the next letter of the word is a vowel and the candidate
 * ends in b/c/ğ/d, as a variant ending in p/ç/k/t. If the word then gets a non-unknown
 * analysis, the (candidate root, word) pair is recorded.
 *
 * <p>Roots matched by at least two words are written to {@code out/root-candidates-words}
 * (root plus its collated word list) and {@code out/root-candidates-vocabulary} (root only),
 * both encoded as UTF-8.
 *
 * @throws IOException if the input cannot be read or an output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
    Log.info("Loading histogram.");
    List<String> words = Files.readAllLines(wordFreqFile);
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("elma").disableCache().build();
    Multimap<String, String> res = HashMultimap.create(100000, 3);
    int c = 0;
    for (String s : words) {
        if (s.length() < 4) {
            continue;
        }
        if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
            continue;
        }
        for (int i = 2; i < s.length(); i++) {
            String candidateRoot = s.substring(0, i + 1);
            if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
                continue;
            }
            List<DictionaryItem> items = new ArrayList<>(3);
            // Entry loaded from the bare candidate (no explicit POS given).
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
            // Same candidate, explicitly declared as a verb.
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
            char last = candidateRoot.charAt(candidateRoot.length() - 1);
            if (i < s.length() - 1) {
                char next = s.charAt(candidateRoot.length());
                if (Turkish.Alphabet.isVowel(next)) {
                    // Before a vowel, also try the variant whose final b/c/ğ/d is replaced by p/ç/k/t.
                    String f = "";
                    if (last == 'b') {
                        f = candidateRoot.substring(0, candidateRoot.length() - 1) + 'p';
                    } else if (last == 'c') {
                        f = candidateRoot.substring(0, candidateRoot.length() - 1) + 'ç';
                    } else if (last == 'ğ') {
                        f = candidateRoot.substring(0, candidateRoot.length() - 1) + 'k';
                    } else if (last == 'd') {
                        f = candidateRoot.substring(0, candidateRoot.length() - 1) + 't';
                    }
                    if (f.length() > 0) {
                        items.add(TurkishDictionaryLoader.loadFromString(f));
                    }
                }
            }
            for (DictionaryItem item : items) {
                // Each candidate item is registered only for the duration of one analysis.
                morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
                WordAnalysis analyze = morphology.analyze(s);
                for (SingleAnalysis wordAnalysis : analyze) {
                    if (!wordAnalysis.isUnknown()) {
                        res.put(candidateRoot, s);
                        // One hit is enough; further puts of the same pair would not change the multimap.
                        break;
                    }
                }
                morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
            }
        }
        // Only words that passed the length and vowel filters are counted.
        if (++c % 10000 == 0) {
            Log.info(c);
        }
        // Stop after 100000 counted words.
        if (c == 100000) {
            break;
        }
    }
    Log.info("Writing.");
    // Explicit UTF-8 so the Turkish letters do not depend on the platform default charset.
    try (PrintWriter pw1 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
        PrintWriter pw2 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
        for (String root : res.keySet()) {
            Collection<String> vals = res.get(root);
            if (vals.size() < 2) {
                continue;
            }
            List<String> wl = new ArrayList<>(vals);
            wl.sort(turkishCollator::compare);
            // Print the collated copy; the original printed the unsorted collection.
            pw1.println(root + " : " + String.join(", ", wl));
            pw2.println(root);
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 95 with WordAnalysis

use of zemberek.morphology.analysis.WordAnalysis in project zemberek-nlp by ahmetaa.

the class LoadProperNouns method main.

/**
 * Builds a histogram of candidate proper nouns from a "word count" vocabulary file and writes
 * the surviving words, sorted with {@code Turkish.STRING_COMPARATOR_ASC}, to the file
 * {@code proper} encoded as UTF-8.
 *
 * <p>A line is skipped when it starts with "_" or is blank after trimming. The word has its
 * first character dropped and is then capitalized. It is skipped when its count is below 50,
 * when it is listed in the ignore file, when it is shorter than four characters, or when the
 * morphology already analyzes it as a proper noun whose dictionary item does not carry the
 * {@code RootAttribute.Runtime} attribute. Finally {@code removeSmaller(165)} is applied to
 * the histogram before writing.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output file cannot be written
 */
public static void main(String[] args) throws IOException {
    TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
    List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
    Histogram<String> histogram = new Histogram<>();
    Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
    for (String line : lines) {
        if (line.startsWith("_")) {
            continue;
        }
        line = line.trim();
        if (line.length() == 0) {
            continue;
        }
        String word = Strings.subStringUntilFirst(line, " ");
        int count = Integer.parseInt(Strings.subStringAfterFirst(line, " "));
        // NOTE(review): the first character of the token is dropped; presumably a marker prefix, confirm against the input format.
        word = Turkish.capitalize(word.substring(1));
        if (count < 50) {
            continue;
        }
        if (ignore.contains(word)) {
            continue;
        }
        // Cheap length filter first, so short words never reach the morphological analysis.
        if (word.length() < 4) {
            continue;
        }
        WordAnalysis parses = parserGenerator.analyze(word);
        boolean found = false;
        for (SingleAnalysis parse : parses) {
            if (parse.getDictionaryItem().secondaryPos.equals(SecondaryPos.ProperNoun) && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
                found = true;
                // One matching analysis is enough.
                break;
            }
        }
        parserGenerator.invalidateCache();
        if (found) {
            continue;
        }
        histogram.add(word, count);
    }
    histogram.removeSmaller(165);
    // Explicit UTF-8 so the Turkish letters do not depend on the platform default charset.
    try (PrintWriter pw = new PrintWriter("proper", "UTF-8")) {
        histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter)

Aggregations

WordAnalysis (zemberek.morphology.analysis.WordAnalysis)96 Test (org.junit.Test)42 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)36 TurkishMorphology (zemberek.morphology.TurkishMorphology)22 ArrayList (java.util.ArrayList)21 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)19 LinkedHashSet (java.util.LinkedHashSet)13 Ignore (org.junit.Ignore)13 Histogram (zemberek.core.collections.Histogram)12 Path (java.nio.file.Path)11 PrintWriter (java.io.PrintWriter)10 SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis)10 IOException (java.io.IOException)6 HashSet (java.util.HashSet)6 List (java.util.List)6 WordAnalyzer (zemberek.morphology.analysis.WordAnalyzer)6 SimpleGenerator (zemberek.morphology.generator.SimpleGenerator)6 DictionaryItem (zemberek.morphology.lexicon.DictionaryItem)6 DynamicLexiconGraph (zemberek.morphology.lexicon.graph.DynamicLexiconGraph)6 Log (zemberek.core.logging.Log)5