Search in sources :

Example 76 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method parseLargeVocabularyZemberek:

/**
 * Script (not a real test, hence {@code @Ignore}) that analyzes every word of the
 * "all-counts-sorted-freq.txt" histogram with the default {@link TurkishMorphology} and
 * collects the words whose first analysis has a dictionary item with a primary POS other
 * than {@code PrimaryPos.Unknown}. The collected words are handed to the {@code save} and
 * {@code sortAndSave} helpers, targeting files under the "out" directory.
 *
 * @throws IOException if the output directory cannot be created or a file operation fails
 */
@Test
@Ignore("Not a Test.")
public void parseLargeVocabularyZemberek() throws IOException {
    // Alternative input: DATA_PATH.resolve("vocab.all.freq")
    Path wordFreqFile = DATA_PATH.resolve("all-counts-sorted-freq.txt");
    Path outDir = DATA_PATH.resolve("out");
    Files.createDirectories(outDir);
    TurkishMorphology parser = TurkishMorphology.createWithDefaults();
    Log.info("Loading histogram.");
    Histogram<String> histogram = Histogram.loadFromUtf8File(wordFreqFile, ' ');
    List<String> accepted = new ArrayList<>(histogram.size() / 3);
    int c = 0;
    for (String s : histogram) {
        try {
            WordAnalysis parses = parser.analyze(s);
            List<SingleAnalysis> analyses = parses.getAnalysisResults();
            if (!analyses.isEmpty()
                    && analyses.get(0).getDictionaryItem().primaryPos != PrimaryPos.Unknown) {
                accepted.add(s);
            }
        } catch (Exception e) {
            // Best effort: skip the failing word, but keep the cause visible in the log.
            Log.info("Exception in %s : %s", s, e);
        }
        // Count outside the try block so that failed words are included in the progress figure.
        c++;
        if (c % 10000 == 0) {
            Log.info("Processed = " + c);
        }
    }
    save(outDir.resolve("zemberek-parsed-words.txt"), accepted);
    sortAndSave(outDir.resolve("zemberek-parsed-words.tr.txt"), accepted);
}
Also used : Path(java.nio.file.Path) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) IOException(java.io.IOException) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 77 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method generateOnlyOflazerWithAnalyzer:

/**
 * Script (not a real test, hence {@code @Ignore}) that reads "only-oflazer-2.txt" from the
 * "out" directory, keeps the lines for which the default {@link TurkishMorphology} analysis
 * reports {@code isCorrect() == false}, and writes them to "only-oflazer-3.txt" in the same
 * directory.
 *
 * @throws IOException if the input cannot be read or the output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void generateOnlyOflazerWithAnalyzer() throws IOException {
    Path inPath = DATA_PATH.resolve("out");
    List<String> oflazer = Files.readAllLines(inPath.resolve("only-oflazer-2.txt"));
    Log.info("Oflazer Loaded. %d words.", oflazer.size());
    List<String> result = new ArrayList<>(oflazer.size() / 10);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    int processed = 0;
    for (String s : oflazer) {
        if (!morphology.analyze(s).isCorrect()) {
            result.add(s);
        }
        // Increment first so the message reports the number of words actually processed
        // (20000, 40000, ...) instead of firing on the very first word.
        processed++;
        if (processed % 20000 == 0) {
            Log.info("%d processed.", processed);
        }
    }
    Log.info("Writing.");
    Files.write(inPath.resolve("only-oflazer-3.txt"), result);
    Log.info("Oflazer-only saved.");
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 78 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method extractCircumflexWords:

/**
 * Script (not a real test, hence {@code @Ignore}) that walks the lexicon of the default
 * {@link TurkishMorphology}, gathers every lemma for which
 * {@code TurkishAlphabet.INSTANCE.containsCircumflex} returns true, and writes the distinct
 * lemmas, in encounter order, to "words-with-circumflex.txt" under the "out" directory.
 *
 * @throws IOException if the output file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void extractCircumflexWords() throws IOException {
    Path outputDir = DATA_PATH.resolve("out");
    TurkishMorphology analyzer = TurkishMorphology.createWithDefaults();
    LinkedHashSet<String> circumflexLemmas = new LinkedHashSet<>();
    for (DictionaryItem entry : analyzer.getLexicon()) {
        String lemma = entry.lemma;
        if (!TurkishAlphabet.INSTANCE.containsCircumflex(lemma)) {
            continue;
        }
        // The set drops repeated lemmas while keeping first-seen order.
        circumflexLemmas.add(lemma);
    }
    Log.info("Writing.");
    Path target = outputDir.resolve("words-with-circumflex.txt");
    Files.write(target, circumflexLemmas);
}
Also used : Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 79 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method createZemberekVocabulary:

/**
 * Script (not a real test, hence {@code @Ignore}) that dumps the lemma of every lexicon item
 * of the default {@link TurkishMorphology}, ordered with {@code turkishCollator}, into the
 * file "zemberek.vocab" under the "out" directory (created if missing).
 *
 * @throws IOException if the directory cannot be created or the file cannot be written
 */
@Test
@Ignore("Not a Test.")
public void createZemberekVocabulary() throws IOException {
    Path targetDir = DATA_PATH.resolve("out");
    Files.createDirectories(targetDir);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // Pre-size the list to the lexicon size; one lemma is added per dictionary item.
    int expectedSize = morphology.getLexicon().size();
    List<String> lemmas = new ArrayList<>(expectedSize);
    for (DictionaryItem entry : morphology.getLexicon()) {
        lemmas.add(entry.lemma);
    }
    lemmas.sort((left, right) -> turkishCollator.compare(left, right));
    Path vocabFile = targetDir.resolve("zemberek.vocab");
    Files.write(vocabFile, lemmas);
}
Also used : Path(java.nio.file.Path) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 80 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

In class ZemberekNlpScripts, method dictionaryObsoleteCircumflexWordsCheck:

/**
 * Script (not a real test, hence {@code @Ignore}) that reads the obsolete circumflex word
 * list, looks each word up in the default lexicon both as written and after
 * {@code TurkishAlphabet.INSTANCE.normalizeCircumflex}, and logs the combined matches.
 * When the combined lookup yields exactly one item, that item's string form is stripped of a
 * fixed set of attribute markers, whitespace-collapsed, and collected; the collected lines
 * are written to "words-with-circumflex-obsolete-single.txt".
 *
 * @throws IOException if the input cannot be read or the output cannot be written
 */
@Test
@Ignore("Not a Test.")
public void dictionaryObsoleteCircumflexWordsCheck() throws IOException {
    Path path = Paths.get("../data/vocabulary/words-with-circumflex-obsolete.txt");
    List<String> obsolete = Files.readAllLines(path, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    RootLexicon lexicon = morphology.getLexicon();
    List<String> single = new ArrayList<>();
    for (String s : obsolete) {
        // Merge into a private copy: the list returned by the lexicon may be shared or
        // unmodifiable, so appending to it directly could corrupt the lexicon or throw.
        List<DictionaryItem> items = new ArrayList<>(lexicon.getMatchingItems(s));
        List<DictionaryItem> matchingItems = lexicon.getMatchingItems(TurkishAlphabet.INSTANCE.normalizeCircumflex(s));
        items.addAll(matchingItems);
        Log.info("%s = %s", s, items);
        if (items.size() == 1) {
            // Strip the listed attribute markers from the item's string form.
            String line = items.get(0).toString();
            line = line.replace("[P:Noun]", "").trim();
            line = line.replace("[P:Noun, Prop]", "").trim();
            line = line.replace("P:Noun; ", "").trim();
            line = line.replace("P:Noun, Prop; ", "").trim();
            line = line.replace("P:Verb; ", "").trim();
            line = line.replace("[A:Voicing]", "").trim();
            single.add(line.replaceAll("\\s+", " ").trim());
        }
    }
    Path pathSingle = Paths.get("../data/vocabulary/words-with-circumflex-obsolete-single.txt");
    Files.write(pathSingle, single, StandardCharsets.UTF_8);
}
Also used : Path(java.nio.file.Path) DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) TurkishMorphology(zemberek.morphology.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5