Search in sources :

Example 41 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class MorphologyConsole method run.

/**
 * Interactive console loop. Builds a {@link TurkishMorphology} instance from the default
 * lexicon, honoring the {@code disableUnknownAnalysis} and {@code enableInformalWordAnalysis}
 * flags, then reads lines from standard input and prints the analyses of every word.
 *
 * <p>For each word, every analysis is printed with {@code formatLong()}. When a word has more
 * than one analysis, the one returned by {@code getBestAnalysis()} is marked with a trailing
 * {@code *}.
 *
 * <p>The loop ends when the line {@code quit} is entered or when standard input reaches
 * end-of-stream (e.g. piped input is exhausted), instead of failing with a
 * {@link java.util.NoSuchElementException}.
 */
@Override
public void run() {
    Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
    if (disableUnknownAnalysis) {
        b.disableUnidentifiedTokenAnalyzer();
    }
    if (enableInformalWordAnalysis) {
        b.useInformalAnalysis();
    }
    TurkishMorphology morphology = b.build();
    System.out.println("Enter word or sentence. Type `quit` or `Ctrl+C` to exit.:");
    // The scanner is intentionally not closed: closing it would also close System.in.
    Scanner sc = new Scanner(System.in);
    // hasNextLine() returns false at end-of-stream, so a closed stdin ends the loop cleanly.
    while (sc.hasNextLine()) {
        String input = sc.nextLine();
        if (input.equals("quit")) {
            break;
        }
        if (input.trim().isEmpty()) {
            System.out.println("Empty line cannot be processed.");
            continue;
        }
        SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
        System.out.format("%nS:%s%n", input);
        for (SentenceWordAnalysis sw : analysis) {
            WordAnalysis wa = sw.getWordAnalysis();
            System.out.println(wa.getInput());
            SingleAnalysis best = sw.getBestAnalysis();
            // With a single analysis there is nothing to disambiguate, so no marker is printed.
            boolean unambiguous = wa.analysisCount() == 1;
            for (SingleAnalysis singleAnalysis : wa) {
                if (unambiguous) {
                    System.out.println(singleAnalysis.formatLong());
                } else {
                    boolean isBest = singleAnalysis.equals(best);
                    System.out.format("%s%s%n", singleAnalysis.formatLong(), isBest ? "*" : "");
                }
            }
        }
        System.out.println();
    }
}
Also used : Scanner(java.util.Scanner) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SentenceWordAnalysis(zemberek.morphology.analysis.SentenceWordAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Builder(zemberek.morphology.TurkishMorphology.Builder) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 42 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NormalizationScripts method splitWords.

/**
 * Finds words that look like two vocabulary words written together and writes the suggested
 * split for each of them.
 *
 * <p>Word counts are loaded from the {@code incorrect} files under {@code noisyVocab} and
 * {@code cleanVocab}; words with a count below {@code minWordCount} are removed. A word is
 * skipped when it is a key of the ascii map file, is shorter than 5 characters, or contains
 * {@code '-'}. For the remaining words every head/tail split (tail at least 2 characters) is
 * considered; a split is kept only if both parts are known to the language model vocabulary,
 * the tail is not in {@code noSplitTails}, the head + "de"/"da" case is not a word that
 * {@code morphology} analyzes as correct, and the (head, tail) n-gram exists in the model.
 *
 * <p>The highest scoring split is written as {@code word = head tail} to {@code splitFile}, and
 * with the word count appended to a second file named {@code splitFile + "freq"}, but only when
 * its score is greater than -7.
 *
 * @param noisyVocab directory containing the noisy corpus {@code incorrect} histogram file
 * @param cleanVocab directory containing the clean corpus {@code incorrect} histogram file
 * @param splitFile output file for the suggested splits
 * @param lmPath path of the language model to load
 * @param asciiMapPath file whose lines are read as {@code key=...}; keys are excluded from output
 * @param morphology analyzer used for the "de"/"da" tail check
 * @param minWordCount minimum count a word must have to be considered
 * @throws IOException if reading the inputs or opening the outputs fails
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
    // Key = everything before the first '=' of each line.
    Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath)
        .stream()
        .map(line -> line.substring(0, line.indexOf('=')))
        .collect(Collectors.toSet());

    SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
    Log.info("Language model = %s", lm.info());

    Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
    wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
    Log.info("%d words loaded.", wordFreq.size());

    wordFreq.removeSmaller(minWordCount);
    if (minWordCount > 1) {
        Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
    }

    final int unkIndex = lm.getVocabulary().getUnknownWordIndex();

    try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
        PrintWriter pwFreq = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
        for (String word : wordFreq.getSortedList()) {
            if (asciiMapKeys.contains(word) || word.length() < 5 || word.contains("-")) {
                continue;
            }

            // Collect every acceptable head/tail split together with its language model score.
            List<ScoredItem<String>> candidates = new ArrayList<>();
            int lastCut = word.length() - 2;
            for (int cut = 1; cut <= lastCut; cut++) {
                String head = word.substring(0, cut);
                String tail = word.substring(cut);
                if (noSplitTails.contains(tail)) {
                    continue;
                }
                int headIndex = lm.getVocabulary().indexOf(head);
                int tailIndex = lm.getVocabulary().indexOf(tail);
                if (headIndex == unkIndex || tailIndex == unkIndex) {
                    continue;
                }
                // Skip "de"/"da" tails when the head alone is analyzed as correct.
                boolean deOrDaTail = tail.equals("de") || tail.equals("da");
                if (deOrDaTail && morphology.analyze(head).isCorrect()) {
                    continue;
                }
                if (lm.ngramExists(headIndex, tailIndex)) {
                    candidates.add(new ScoredItem<>(head + " " + tail, lm.getProbability(headIndex, tailIndex)));
                }
            }

            if (candidates.isEmpty()) {
                continue;
            }
            // Descending by score, so the best candidate ends up first.
            candidates.sort((left, right) -> Double.compare(right.score, left.score));
            ScoredItem<String> best = candidates.get(0);
            if (best.score > -7) {
                pw.println(word + " = " + best.item);
                pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
            }
        }
    }
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) TextUtil(zemberek.core.text.TextUtil) Callable(java.util.concurrent.Callable) CompletionService(java.util.concurrent.CompletionService) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Token(zemberek.tokenization.Token) HashMultimap(com.google.common.collect.HashMultimap) Charset(java.nio.charset.Charset) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) ExecutorService(java.util.concurrent.ExecutorService) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Charsets(com.google.common.base.Charsets) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Deasciifier(zemberek.normalization.deasciifier.Deasciifier) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) LanguageIdentifier(zemberek.langid.LanguageIdentifier) SmoothLm(zemberek.lm.compression.SmoothLm) FixedBitVector(zemberek.core.collections.FixedBitVector) ScoredItem(zemberek.core.ScoredItem) RootLexicon(zemberek.morphology.lexicon.RootLexicon) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) BlockTextLoader(zemberek.core.text.BlockTextLoader)

Example 43 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class NormalizationScripts method cleanTwitterData.

/**
 * Processes the text in {@code in} block by block on a thread pool, submitting one
 * {@code TwitterTask} per block. All tasks share a single {@code TwitterSaver} that is created
 * for {@code out} and for a second path named {@code out + ".foreign"}.
 *
 * <p>The pool uses half of the available processors, clamped to the range [1, 20]. The method
 * blocks until all submitted tasks have finished (waiting at most one day).
 *
 * @param in input text file, loaded in blocks of 20,000 (unit defined by {@code BlockTextLoader})
 * @param out output path handed to the {@code TwitterSaver}
 * @throws Exception if loading, task submission, or waiting for termination fails
 */
static void cleanTwitterData(Path in, Path out) throws Exception {
    AnalysisCache cache = AnalysisCache.builder().dynamicCacheSize(300_000, 500_000).build();
    TurkishMorphology morphology = TurkishMorphology.builder().setCache(cache).setLexicon(RootLexicon.getDefault()).disableUnidentifiedTokenAnalyzer().build();
    // availableProcessors() / 2 is 0 on a single-processor machine; a pool needs at least one
    // thread to make progress, so clamp to [1, 20].
    int threadCount = Math.max(1, Math.min(20, Runtime.getRuntime().availableProcessors() / 2));
    ExecutorService executorService = new BlockingExecutor(threadCount);
    try {
        CompletionService<TwitterSaver> service = new ExecutorCompletionService<>(executorService);
        int blockSize = 20_000;
        BlockTextLoader loader = BlockTextLoader.fromPath(in, blockSize);
        Path foreign = Paths.get(out.toString() + ".foreign");
        TwitterSaver saver = new TwitterSaver(out, foreign, blockSize);
        int bc = 0;
        for (TextChunk block : loader) {
            service.submit(new TwitterTask(morphology, saver, block, bc));
            bc++;
        }
    } finally {
        // Always stop accepting work so the pool threads can exit even if loading or
        // submission fails part-way through.
        executorService.shutdown();
    }
    executorService.awaitTermination(1, TimeUnit.DAYS);
}
Also used : AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Path(java.nio.file.Path) BlockTextLoader(zemberek.core.text.BlockTextLoader) ExecutorService(java.util.concurrent.ExecutorService) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) TextChunk(zemberek.core.text.TextChunk) TurkishMorphology(zemberek.morphology.TurkishMorphology) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor)

Example 44 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class AnalyzeAndConvertInformalWords method main.

/**
 * Demo entry point: analyzes the informal sentence "okuycam diyo" with informal analysis
 * enabled, prints each best analysis prefixed by its surface form, and then prints the result
 * of passing each analysis through an {@link InformalAnalysisConverter}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    TurkishMorphology morphology = TurkishMorphology.builder()
        .setLexicon(RootLexicon.getDefault())
        .useInformalAnalysis()
        .build();

    List<SingleAnalysis> bestAnalyses = morphology
        .analyzeAndDisambiguate("okuycam diyo")
        .bestAnalysis();

    // First pass: surface form and the analysis selected for it.
    bestAnalyses.forEach(analysis -> System.out.println(analysis.surfaceForm() + "-" + analysis));

    System.out.println("Converting formal surface form:");

    // Second pass: run every selected analysis through the converter.
    InformalAnalysisConverter converter = new InformalAnalysisConverter(morphology.getWordGenerator());
    for (SingleAnalysis analysis : bestAnalyses) {
        System.out.println(converter.convert(analysis.surfaceForm(), analysis));
    }
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) InformalAnalysisConverter(zemberek.morphology.analysis.InformalAnalysisConverter) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Example 45 with TurkishMorphology

use of zemberek.morphology.TurkishMorphology in project zemberek-nlp by ahmetaa.

the class DisambiguateSentences method main.

/**
 * Demo entry point: analyzes a fixed Turkish sentence, logs every analysis of every word,
 * then disambiguates the sentence and logs the best analysis chosen for each word.
 *
 * @param args unused
 */
public static void main(String[] args) {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

    String sentence = "Bol baharatlı bir yemek yaptıralım.";
    Log.info("Sentence  = " + sentence);

    // Step 1: all possible analyses, word by word.
    List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
    Log.info("Sentence word analysis result:");
    for (WordAnalysis wordAnalysis : wordAnalyses) {
        Log.info("Word = " + wordAnalysis.getInput());
        for (SingleAnalysis candidate : wordAnalysis) {
            Log.info(candidate.formatLong());
        }
    }

    // Step 2: pick the best analysis per word using the sentence context.
    SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
    Log.info("\nAfter ambiguity resolution : ");
    disambiguated.bestAnalysis().forEach(Log::info);
}
Also used : SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) Log(zemberek.core.logging.Log) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) TurkishMorphology(zemberek.morphology.TurkishMorphology)

Aggregations

TurkishMorphology (zemberek.morphology.TurkishMorphology): 87, Test (org.junit.Test): 38, Path (java.nio.file.Path): 34, ArrayList (java.util.ArrayList): 23, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 23, WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 23, Ignore (org.junit.Ignore): 21, DictionaryItem (zemberek.morphology.lexicon.DictionaryItem): 15, LinkedHashSet (java.util.LinkedHashSet): 13, PrintWriter (java.io.PrintWriter): 10, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 10, Stopwatch (com.google.common.base.Stopwatch): 8, Histogram (zemberek.core.collections.Histogram): 8, Token (zemberek.tokenization.Token): 8, HashSet (java.util.HashSet): 7, SentenceWordAnalysis (zemberek.morphology.analysis.SentenceWordAnalysis): 7, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 7, ScoredItem (zemberek.core.ScoredItem): 6, IOException (java.io.IOException): 5, BlockTextLoader (zemberek.core.text.BlockTextLoader): 5