Search in sources :

Example 46 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class HunspellOperations method generateAnnotationFileMultiSplit.

/**
 * Reads a vocabulary file (one word per line) and writes one annotation line per analyzable word.
 *
 * <p>For every analysis of a word that is neither a proper noun nor an abbreviation, the surface
 * sequence is split on spaces and every prefix of the token sequence is tried as a stem. A split
 * is recorded as {@code "word stem ending"} when the stem passes
 * {@code isCorrectAndContainsNoProper} and the remaining ending is non-empty. All splits of a word
 * are joined with commas into a single output line; a word with no accepted split still produces
 * an (empty) line. Words whose analysis is not correct are logged and skipped entirely.
 *
 * @param vocab path of the UTF-8 vocabulary file to read
 * @param annotationsPath path of the UTF-8 annotation file to write
 * @throws IOException if reading the vocabulary or writing the annotations fails
 */
private static void generateAnnotationFileMultiSplit(Path vocab, Path annotationsPath) throws IOException {
    List<String> words = Files.readAllLines(vocab, StandardCharsets.UTF_8);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    List<String> annotations = new ArrayList<>();
    for (String word : words) {
        WordAnalysis wordAnalysis = morphology.analyze(word);
        if (!wordAnalysis.isCorrect()) {
            Log.warn("Cannot analyze %s", word);
            continue;
        }
        // Insertion-ordered set: identical splits coming from different analyses are kept once.
        LinkedHashSet<String> splits = new LinkedHashSet<>();
        for (SingleAnalysis single : wordAnalysis) {
            SecondaryPos secondaryPos = single.getDictionaryItem().secondaryPos;
            boolean properOrAbbreviation =
                secondaryPos == SecondaryPos.ProperNoun || secondaryPos == SecondaryPos.Abbreviation;
            if (properOrAbbreviation) {
                continue;
            }
            List<String> surfaceTokens =
                Splitter.on(" ").splitToList(AnalysisFormatters.SURFACE_SEQUENCE.format(single));
            int tokenCount = surfaceTokens.size();
            StringBuilder stemBuilder = new StringBuilder();
            // prefixLength tokens form the candidate stem; the rest form the ending.
            for (int prefixLength = 1; prefixLength <= tokenCount; prefixLength++) {
                stemBuilder.append(surfaceTokens.get(prefixLength - 1));
                String stem = stemBuilder.toString();
                // subList yields an empty list once the prefix covers every token.
                String ending = String.join(" ", surfaceTokens.subList(prefixLength, tokenCount));
                boolean stemAccepted = isCorrectAndContainsNoProper(morphology.analyze(stem));
                if (stemAccepted && !ending.isEmpty()) {
                    splits.add(word + " " + stem + " " + ending);
                }
            }
        }
        annotations.add(String.join(",", splits));
    }
    Files.write(annotationsPath, annotations, StandardCharsets.UTF_8);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 47 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class TurkishStopWords method generateFromDictionary.

/**
 * Builds a stop-word list from the default morphology lexicon.
 *
 * <p>Collects the lemma of every dictionary item whose primary part of speech is one of
 * Adverb, Conjunction, Determiner, Interjection, PostPositive, Numeral, Pronoun or Question,
 * removes duplicates, and sorts the lemmas with {@code Turkish.STRING_COMPARATOR_ASC}.
 *
 * @return a {@code TurkishStopWords} backed by the sorted, de-duplicated lemmas
 * @throws IOException declared by the signature; propagated from callees if they raise it
 */
static TurkishStopWords generateFromDictionary() throws IOException {
    Set<PrimaryPos> acceptedPos = Sets.newHashSet(
        PrimaryPos.Adverb,
        PrimaryPos.Conjunction,
        PrimaryPos.Determiner,
        PrimaryPos.Interjection,
        PrimaryPos.PostPositive,
        PrimaryPos.Numeral,
        PrimaryPos.Pronoun,
        PrimaryPos.Question);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Set<String> lemmas = new HashSet<>();
    RootLexicon lexicon = morphology.getLexicon();
    for (DictionaryItem entry : lexicon) {
        if (!acceptedPos.contains(entry.primaryPos)) {
            continue;
        }
        lemmas.add(entry.lemma);
    }
    // Sort a list copy, then wrap it in a LinkedHashSet so the sorted order is retained.
    List<String> ordered = new ArrayList<>(lemmas);
    ordered.sort(Turkish.STRING_COMPARATOR_ASC);
    LinkedHashSet<String> orderedSet = new LinkedHashSet<>(ordered);
    return new TurkishStopWords(orderedSet);
}
Also used : DictionaryItem(zemberek.morphology.lexicon.DictionaryItem) PrimaryPos(zemberek.core.turkish.PrimaryPos) ArrayList(java.util.ArrayList) RootLexicon(zemberek.morphology.lexicon.RootLexicon) TurkishMorphology(zemberek.morphology.TurkishMorphology) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 48 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class GenerateNerModel method main.

/**
 * Trains a perceptron NER model from local data sets and saves it as text.
 *
 * <p>Requires {@code ner-train} and {@code ner-test} files (bracket annotation style) in the
 * working directory. The model is written under {@code my-model}.
 *
 * @param args unused
 * @throws IOException if loading the data sets or writing the model fails
 */
public static void main(String[] args) throws IOException {
    // you will need ner-train and ner-test files to run this example.
    Path trainPath = Paths.get("ner-train");
    Path testPath = Paths.get("ner-test");

    NerDataSet trainingSet = NerDataSet.load(trainPath, AnnotationStyle.BRACKET);
    // prints information about the training set
    Log.info(trainingSet.info());

    NerDataSet testSet = NerDataSet.load(testPath, AnnotationStyle.BRACKET);
    Log.info(testSet.info());

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

    // Training occurs here. Result is a PerceptronNer instance.
    // The trainer is invoked with 13 iterations and a 0.1 learning rate.
    int iterationCount = 13;
    float learningRate = 0.1f;
    PerceptronNerTrainer trainer = new PerceptronNerTrainer(morphology);
    PerceptronNer ner = trainer.train(trainingSet, testSet, iterationCount, learningRate);

    Path modelRoot = Paths.get("my-model");
    Files.createDirectories(modelRoot);
    ner.saveModelAsText(modelRoot);
}
Also used : Path(java.nio.file.Path) NerDataSet(zemberek.ner.NerDataSet) PerceptronNer(zemberek.ner.PerceptronNer) PerceptronNerTrainer(zemberek.ner.PerceptronNerTrainer) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 49 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NormalizeNoisyText method main.

/**
 * Normalizes a handful of noisy Turkish sentences and prints each original sentence followed by
 * its normalized form and a blank line.
 *
 * @param args unused
 * @throws IOException if the normalizer cannot load its resources
 */
public static void main(String[] args) throws IOException {
    String[] noisySentences = {
        "Yrn okua gidicem",
        "Tmm, yarin havuza giricem ve aksama kadar yaticam :)",
        "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo",
        "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo",
        "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.",
        "yok hocam kesınlıkle oyle birşey yok",
        "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa"
    };

    // Change these paths to your normalization data root folder and language model file.
    // Example: https://drive.google.com/drive/folders/1tztjRiUs9BOTH-tb1v7FWyixl-iUpydW
    // Download the lm and normalization folders to some local directory.
    Path dataRoot = Paths.get("/home/aaa/zemberek-data");
    Path normalizationDir = dataRoot.resolve("normalization");
    Path languageModel = dataRoot.resolve("lm/lm.2gram.slm");

    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSentenceNormalizer normalizer =
        new TurkishSentenceNormalizer(morphology, normalizationDir, languageModel);

    for (String sentence : noisySentences) {
        System.out.println(sentence);
        System.out.println(normalizer.normalize(sentence));
        System.out.println();
    }
}
Also used : Path(java.nio.file.Path) TurkishSentenceNormalizer(zemberek.normalization.TurkishSentenceNormalizer) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 50 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class AmbiguousExampleFinder method main.

/**
 * Creates the default morphology and a finder over a local corpus index, then hands both to
 * {@code extractSentences}.
 *
 * @param args unused
 * @throws Exception propagated from morphology creation, index access or sentence extraction
 */
public static void main(String[] args) throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    // Location of the corpus index on the local machine; adjust before running.
    CorpusSearcher corpusSearcher =
        new CorpusSearcher(Paths.get("/home/aaa/data/zemberek/corpus-index"));
    extractSentences(morphology, new AmbiguousExampleFinder(corpusSearcher));
}
Also used : Path(java.nio.file.Path) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87; Test (org.junit.Test): 38; Path (java.nio.file.Path): 34; ArrayList (java.util.ArrayList): 23; SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23; WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23; Ignore (org.junit.Ignore): 21; DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15; LinkedHashSet (java.util.LinkedHashSet): 13; PrintWriter (java.io.PrintWriter): 10; SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10; Stopwatch (com.google.common.base.Stopwatch): 8; Histogram (zemberek.core.collections.Histogram): 8; Token (zemberek.tokenization.Token): 8; HashSet (java.util.HashSet): 7; SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7; TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7; ScoredItem (zemberek.core.ScoredItem): 6; IOException (java.io.IOException): 5; BlockTextLoader (zemberek.core.text.BlockTextLoader): 5