Search in sources :

Example 51 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishMorphologyFunctionalTests method testEmail.

/**
 * Analyzes an e-mail address carrying an apostrophe-separated suffix and checks that a single
 * analysis is produced whose dictionary item is tagged as an e-mail with the bare address as lemma.
 */
@Test
public void testEmail() {
    WordAnalysis wordAnalysis = getMorphology().analyze("foo@bar.com'a");

    // Exactly one analysis is expected for this input.
    Assert.assertEquals(1, wordAnalysis.analysisCount());
    SingleAnalysis only = wordAnalysis.getAnalysisResults().get(0);

    Assert.assertEquals(SecondaryPos.Email, only.getDictionaryItem().secondaryPos);
    // The lexical form must end with the "A3sg+Dat" morpheme sequence.
    Assert.assertTrue(only.formatLexical().endsWith("A3sg+Dat"));
    // The lemma is the address without the apostrophe and suffix.
    Assert.assertEquals("foo@bar.com", only.getDictionaryItem().lemma);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Test(org.junit.Test)

Example 52 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class TurkishMorphologyFunctionalTests method testAsciiTolerantMorphology.

/**
 * Checks that the analyzer obtained from getAsciiTolerantMorphology(...) resolves inputs whose
 * Turkish-specific letters are swapped with look-alike letters to the expected dictionary lemmas.
 */
@Test
public void testAsciiTolerantMorphology() {
    // The strings passed here are presumably the only lexicon entries of this instance
    // (NOTE(review): confirm against getAsciiTolerantMorphology).
    RuleBasedAnalyzer asciiTolerant = getAsciiTolerantMorphology(
        "sıra", "şıra", "armut", "kazan", "ekonomik [P:Adj]", "insan").getAnalyzer();

    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("ekonomık"), "ekonomik"));

    // "sira" must yield exactly two analyses covering both "sıra" and "şıra".
    List<SingleAnalysis> siraAnalyses = asciiTolerant.analyze("sira");
    Assert.assertEquals(2, siraAnalyses.size());
    Assert.assertTrue(containsAllDictionaryLemma(siraAnalyses, "sıra", "şıra"));

    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("siraci"), "sıra", "şıra"));
    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("armutcuga"), "armut"));
    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("kazancıga"), "kazan"));
    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("kazanciga"), "kazan"));
    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("kazançiğimizdan"), "kazan"));
    Assert.assertTrue(containsAllDictionaryLemma(asciiTolerant.analyze("ınsanların"), "insan"));
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) RuleBasedAnalyzer(zemberek.morphology.analysis.RuleBasedAnalyzer) Test(org.junit.Test)

Example 53 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method parseLargeVocabularyZemberek.

/**
 * Script (not a real test): runs the default morphology over every word of a frequency
 * histogram and stores the words whose first analysis has a known primary POS.
 *
 * <p>Reads {@code all-counts-sorted-freq.txt} from DATA_PATH and writes
 * {@code zemberek-parsed-words.txt} and {@code zemberek-parsed-words.tr.txt} into
 * {@code DATA_PATH/out}.
 *
 * @throws IOException if the output directory, the input file or the output files fail
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
    // Alternative input used previously: DATA_PATH.resolve("vocab.all.freq")
    Path frequencyFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
    Path outputDirectory = DATA_PATH.resolve("out");
    Files.createDirectories(outputDirectory);

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Log.info("Loading histogram.");
    Histogram<String> wordCounts = Histogram.loadFromUtf8File(frequencyFile, ' ');

    List<String> accepted = new ArrayList<>(wordCounts.size() / 3);
    int processed = 0;
    for (String word : wordCounts) {
        try {
            List<SingleAnalysis> results = morphology.analyze(word).getAnalysisResults();
            boolean firstResultIsKnown = !results.isEmpty()
                && results.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown;
            if (firstResultIsKnown) {
                accepted.add(word);
            }
            // Progress report every 10000 successfully handled words.
            if (processed > 0 && processed % 10000 == 0) {
                Log.info("Processed = " + processed);
            }
            processed++;
        } catch (Exception e) {
            // Best effort: a failing word is reported and skipped; it is not counted.
            Log.info("Exception in %s", word);
        }
    }

    save(outputDirectory.resolve("zemberek-parsed-words.txt"), accepted);
    sortAndSave(outputDirectory.resolve("zemberek-parsed-words.tr.txt"), accepted);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 54 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class ZemberekNlpScripts method guessRootsWithHeuristics.

/**
 * Script (not a real test): guesses candidate roots for words read from
 * {@code out/no-parse-zemberek-freq.txt} under DATA_PATH.
 *
 * <p>For each word of at least four characters that contains a vowel, every prefix of three or
 * more characters that contains a vowel is tried as a root. The prefix is temporarily registered
 * as a dictionary item in three shapes: the plain form, the {@code [P:Verb]} form and, when the
 * letter following the prefix is a vowel, a form whose final b/c/ğ/d is replaced by p/ç/k/t.
 * If the word then receives a non-unknown analysis, the (prefix, word) pair is recorded.
 * Processing stops after 100000 words that passed the length and vowel filters.
 *
 * <p>Prefixes that explain at least two words are written to
 * {@code out/root-candidates-words} (prefix plus its collator-sorted words) and
 * {@code out/root-candidates-vocabulary} (prefix only), both encoded as UTF-8.
 *
 * @throws IOException if the input cannot be read or an output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void guessRootsWithHeuristics() throws IOException {
    Path wordFreqFile = DATA_PATH.resolve("out/no-parse-zemberek-freq.txt");
    Log.info("Loading histogram.");
    List<String> words = Files.readAllLines(wordFreqFile);
    // Minimal lexicon with the cache disabled, so temporarily added items take effect at once.
    TurkishMorphology morphology = TurkishMorphology.builder().setLexicon("elma").disableCache().build();
    Multimap<String, String> res = HashMultimap.create(100000, 3);
    int c = 0;
    for (String s : words) {
        if (s.length() < 4) {
            continue;
        }
        if (!TurkishAlphabet.INSTANCE.containsVowel(s)) {
            continue;
        }
        for (int i = 2; i < s.length(); i++) {
            String candidateRoot = s.substring(0, i + 1);
            if (!TurkishAlphabet.INSTANCE.containsVowel(candidateRoot)) {
                continue;
            }
            List<DictionaryItem> items = new ArrayList<>(3);
            // Plain entry (presumably treated as a noun by the loader).
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot));
            // Same surface form declared as a verb.
            items.add(TurkishDictionaryLoader.loadFromString(candidateRoot + " [P:Verb]"));
            if (i < s.length() - 1) {
                char next = s.charAt(candidateRoot.length());
                if (Turkish.Alphabet.isVowel(next)) {
                    // A vowel follows the prefix: also try the root with its final
                    // consonant swapped (b->p, c->ç, ğ->k, d->t).
                    String stem = candidateRoot.substring(0, candidateRoot.length() - 1);
                    char last = candidateRoot.charAt(candidateRoot.length() - 1);
                    String f = "";
                    switch (last) {
                        case 'b':
                            f = stem + 'p';
                            break;
                        case 'c':
                            f = stem + 'ç';
                            break;
                        case 'ğ':
                            f = stem + 'k';
                            break;
                        case 'd':
                            f = stem + 't';
                            break;
                        default:
                            break;
                    }
                    if (f.length() > 0) {
                        items.add(TurkishDictionaryLoader.loadFromString(f));
                    }
                }
            }
            for (DictionaryItem item : items) {
                morphology.getMorphotactics().getStemTransitions().addDictionaryItem(item);
                try {
                    for (SingleAnalysis wordAnalysis : morphology.analyze(s)) {
                        if (!wordAnalysis.isUnknown()) {
                            // One hit is enough; the multimap stores each pair only once.
                            res.put(candidateRoot, s);
                            break;
                        }
                    }
                } finally {
                    // Always undo the temporary registration, even if analysis fails,
                    // so later candidates are not analyzed against a stale item.
                    morphology.getMorphotactics().getStemTransitions().removeDictionaryItem(item);
                }
            }
        }
        if (++c % 10000 == 0) {
            Log.info(c);
        }
        if (c == 100000) {
            break;
        }
    }
    Log.info("Writing.");
    // The input is read as UTF-8 (Files.readAllLines default); write the same encoding
    // instead of the platform default so Turkish letters survive on every machine.
    try (PrintWriter pw1 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-words").toFile(), "UTF-8");
        PrintWriter pw2 = new PrintWriter(DATA_PATH.resolve("out/root-candidates-vocabulary").toFile(), "UTF-8")) {
        for (String root : res.keySet()) {
            Collection<String> vals = res.get(root);
            if (vals.size() < 2) {
                continue;
            }
            List<String> wl = new ArrayList<>(vals);
            wl.sort(turkishCollator::compare);
            // Print the sorted copy, not the unordered multimap view.
            pw1.println(root + " : " + String.join(", ", wl));
            pw2.println(root);
        }
    }
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishDictionaryLoader(zemberek.morphology.lexicon.tr.TurkishDictionaryLoader) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 55 with SingleAnalysis

use of zemberek.morphology.analysis.SingleAnalysis in project zemberek-nlp by ahmetaa.

the class LoadProperNouns method main.

/**
 * Collects frequent proper-noun candidates that the default morphology does not already know
 * as non-runtime proper nouns, and writes them to the file {@code proper} as UTF-8.
 *
 * <p>Each input line is expected to be {@code <token> <count>}. Lines starting with "_" and
 * blank lines are skipped. A word is dropped when its count is below 50, when it is shorter
 * than four characters, when it is listed in the {@code proper-ignore} resource, or when any
 * analysis has a ProperNoun dictionary item without the Runtime attribute.
 *
 * @param args unused
 * @throws IOException if an input file cannot be read or the output cannot be written
 */
public static void main(String[] args) throws IOException {
    TurkishMorphology parserGenerator = TurkishMorphology.createWithDefaults();
    List<String> lines = Files.readAllLines(Paths.get("/home/afsina/Downloads/documents-export-2016-02-17/vocabulary-proper-full.tr.txt"));
    Histogram<String> histogram = new Histogram<>();
    Set<String> ignore = new HashSet<>(Files.readAllLines(Paths.get("morphology/src/main/resources/tr/proper-ignore")));
    for (String line : lines) {
        if (line.startsWith("_")) {
            continue;
        }
        line = line.trim();
        if (line.isEmpty()) {
            continue;
        }
        String word = Strings.subStringUntilFirst(line, " ");
        int count = Integer.parseInt(Strings.subStringAfterFirst(line, " "));
        // The first character of the token is dropped before capitalizing
        // (presumably a marker prefix in the input — TODO confirm).
        word = Turkish.capitalize(word.substring(1));
        // Cheap filters first, so rejected words never reach the analyzer.
        if (count < 50 || word.length() < 4 || ignore.contains(word)) {
            continue;
        }
        boolean found = false;
        for (SingleAnalysis parse : parserGenerator.analyze(word)) {
            // Constant-first equals keeps the check safe if secondaryPos is null.
            if (SecondaryPos.ProperNoun.equals(parse.getDictionaryItem().secondaryPos)
                && !parse.getDictionaryItem().hasAttribute(RootAttribute.Runtime)) {
                found = true;
                break;
            }
        }
        parserGenerator.invalidateCache();
        if (!found) {
            histogram.add(word, count);
        }
    }
    // Presumably drops entries whose count is below 165 — verify against Histogram.removeSmaller.
    histogram.removeSmaller(165);
    // The inputs are read as UTF-8 (Files.readAllLines default); write the same encoding
    // instead of the platform default so Turkish letters are preserved.
    try (PrintWriter pw = new PrintWriter("proper", "UTF-8")) {
        histogram.getSortedList(Turkish.STRING_COMPARATOR_ASC).forEach(pw::println);
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) PrintWriter(java.io.PrintWriter)

Aggregations

SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 55; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 38; ArrayList (java.util.ArrayList): 25; SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 23; TurkishMorphology (zemberek.morphology.TurkishMorphology): 21; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 18; Test (org.junit.Test): 15; LinkedHashSet (java.util.LinkedHashSet): 13; PrintWriter (java.io.PrintWriter): 10; Path (java.nio.file.Path): 10; Histogram (zemberek.core.collections.Histogram): 10; Token (zemberek.tokenization.Token): 7; IOException (java.io.IOException): 6; Ignore (org.junit.Ignore): 6; Log (zemberek.core.logging.Log): 6; HashSet (java.util.HashSet): 5; List (java.util.List): 5; Collectors (java.util.stream.Collectors): 5; Paths (java.nio.file.Paths): 4; Files (java.nio.file.Files): 3