Search in sources :

Example 11 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class FastText method predict.

/**
 * Produces scored label predictions for a single line of text.
 *
 * <p>The line is handed to {@code dict_.getLine} together with two fresh
 * {@link IntVector}s. If the word vector is still empty afterwards, an empty
 * list is returned without consulting the model. Otherwise the model is queried
 * with a copy of the word vector, and every returned pair is converted to a
 * {@link ScoredItem} whose item is the label text looked up through
 * {@code dict_.getLabel} and whose score is the pair's float value.
 *
 * @param line text to classify
 * @param k passed through to {@code model_.predict} as its last argument
 * @param threshold passed through to {@code model_.predict} as its second argument
 * @return scored labels in the order the model returned them; empty if no words
 *     were extracted from {@code line}
 */
public List<ScoredItem<String>> predict(String line, int k, float threshold) {
    IntVector wordIds = new IntVector();
    IntVector labelIds = new IntVector();
    dict_.getLine(line, wordIds, labelIds);

    // Nothing to feed the model with.
    if (wordIds.isempty()) {
        return Collections.emptyList();
    }

    List<Model.FloatIntPair> rawPredictions = model_.predict(wordIds.copyOf(), threshold, k);
    List<ScoredItem<String>> scored = new ArrayList<>(rawPredictions.size());
    rawPredictions.forEach(
        prediction -> scored.add(new ScoredItem<>(dict_.getLabel(prediction.second), prediction.first)));
    return scored;
}
Also used : IntVector(zemberek.core.collections.IntVector) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList)

Example 12 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class NormalizationScripts method splitWords.

/**
 * Searches for words that can be written as two separate words and writes the best
 * split for each of them.
 *
 * <p>Words are read from the {@code incorrect} entries under {@code noisyVocab} and
 * {@code cleanVocab}, merged into one histogram, and pruned with
 * {@code removeSmaller(minWordCount)}. A word is skipped when it is a key of the ascii
 * map file, is shorter than five characters, or contains a hyphen. For every remaining
 * word each head/tail split is considered; a split is kept only when the tail is not in
 * {@code noSplitTails}, both parts are known to the language model vocabulary, the
 * split is not a "de"/"da" tail after a head that morphology analyses as correct, and
 * the language model reports that the (head, tail) n-gram exists. The highest scoring
 * split is written if its score (natural-log base, see the model builder call) is
 * greater than -7.
 *
 * <p>Two files are written: {@code splitFile} with {@code word = head tail} lines, and a
 * second file at the absolute path of {@code splitFile} with {@code "freq"} appended,
 * which additionally carries the word count.
 *
 * @param noisyVocab directory resolved against {@code "incorrect"} for the first histogram
 * @param cleanVocab directory resolved against {@code "incorrect"} for the second histogram
 * @param splitFile output file for the chosen splits
 * @param lmPath language model file
 * @param asciiMapPath file whose lines are read up to their first {@code '='} as keys
 * @param morphology analyzer used for the "de"/"da" check
 * @param minWordCount threshold passed to {@code removeSmaller} on the merged histogram
 * @throws IOException if reading the inputs or opening the outputs fails
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
    Set<String> asciiKeys = Files.readAllLines(asciiMapPath)
        .stream()
        .map(line -> line.substring(0, line.indexOf('=')))
        .collect(Collectors.toSet());

    SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
    Log.info("Language model = %s", lm.info());

    Histogram<String> counts = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
    counts.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
    Log.info("%d words loaded.", counts.size());
    counts.removeSmaller(minWordCount);
    if (minWordCount > 1) {
        Log.info("%d words left after removing counts less than %d.", counts.size(), minWordCount);
    }

    final int unknownId = lm.getVocabulary().getUnknownWordIndex();
    try (PrintWriter splitOut = new PrintWriter(splitFile.toFile(), "utf-8");
        PrintWriter freqOut = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
        for (String word : counts.getSortedList()) {
            boolean ignored = asciiKeys.contains(word) || word.length() < 5 || word.contains("-");
            if (ignored) {
                continue;
            }

            List<ScoredItem<String>> candidates = new ArrayList<>();
            final int end = word.length() - 1;
            for (int cut = 1; cut < end; cut++) {
                String head = word.substring(0, cut);
                String tail = word.substring(cut);
                if (noSplitTails.contains(tail)) {
                    continue;
                }
                int headId = lm.getVocabulary().indexOf(head);
                int tailId = lm.getVocabulary().indexOf(tail);
                if (headId == unknownId || tailId == unknownId) {
                    continue;
                }
                boolean deOrDaTail = tail.equals("de") || tail.equals("da");
                if (deOrDaTail && morphology.analyze(head).isCorrect()) {
                    continue;
                }
                if (lm.ngramExists(headId, tailId)) {
                    candidates.add(new ScoredItem<>(head + " " + tail, lm.getProbability(headId, tailId)));
                }
            }

            if (candidates.isEmpty()) {
                continue;
            }
            // Highest score first; sorting a single element list is a no-op.
            candidates.sort((x, y) -> Double.compare(y.score, x.score));
            ScoredItem<String> top = candidates.get(0);
            if (top.score > -7) {
                splitOut.println(word + " = " + top.item);
                freqOut.println(word + " = " + top.item + " " + counts.getCount(word));
            }
        }
    }
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) TextUtil(zemberek.core.text.TextUtil) Callable(java.util.concurrent.Callable) CompletionService(java.util.concurrent.CompletionService) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Token(zemberek.tokenization.Token) HashMultimap(com.google.common.collect.HashMultimap) Charset(java.nio.charset.Charset) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) ExecutorService(java.util.concurrent.ExecutorService) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Charsets(com.google.common.base.Charsets) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Deasciifier(zemberek.normalization.deasciifier.Deasciifier) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) LanguageIdentifier(zemberek.langid.LanguageIdentifier) SmoothLm(zemberek.lm.compression.SmoothLm) FixedBitVector(zemberek.core.collections.FixedBitVector) ScoredItem(zemberek.core.ScoredItem) RootLexicon(zemberek.morphology.lexicon.RootLexicon) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) BlockTextLoader(zemberek.core.text.BlockTextLoader) ScoredItem(zemberek.core.ScoredItem) 
ArrayList(java.util.ArrayList) SmoothLm(zemberek.lm.compression.SmoothLm) PrintWriter(java.io.PrintWriter)

Example 13 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class ItemFindExperiment method testGrams.

/**
 * Runs a classifier over the n-grams of every test line and records the grams that are
 * predicted as {@code labelTarget}.
 *
 * <p>The model is loaded from {@code root/<name>.model} and the report is written to
 * {@code root/<name>.predictions}. Each test line is split on single spaces; the first
 * token is dropped and the remainder is passed to {@code getGrams(..., 7)}. For every
 * gram the classifier is asked for 2 predictions, and a hit is recorded when the
 * predicted item equals {@code labelTarget} and {@code exp(score)} exceeds 0.45. For each
 * line the report contains the line itself, its hits, and a separator line.
 *
 * @param root directory holding the model and receiving the predictions file
 * @param test file with one test sentence per line
 * @param labelTarget label that counts as a hit
 * @param name base name of the model and predictions files
 * @throws IOException if the test file cannot be read or the report cannot be opened
 */
static void testGrams(Path root, Path test, String labelTarget, String name) throws IOException {
    Path modelPath = root.resolve(name + ".model");
    Path predictions = root.resolve(name + ".predictions");
    FastTextClassifier classifier = FastTextClassifier.load(modelPath);
    try (PrintWriter report = new PrintWriter(predictions.toFile(), "utf-8")) {
        for (String sentence : Files.readAllLines(test)) {
            List<String> tokens = Splitter.on(" ").splitToList(sentence);
            // Everything after the first token.
            List<String> grams = getGrams(tokens.subList(1, tokens.size()), 7);

            List<Hit> hits = new ArrayList<>();
            for (String gram : grams) {
                for (ScoredItem<String> prediction : classifier.predict(gram, 2)) {
                    if (!prediction.item.equals(labelTarget)) {
                        continue;
                    }
                    float probability = (float) Math.exp(prediction.score);
                    if (probability > 0.45) {
                        hits.add(new Hit(gram, prediction));
                    }
                }
            }

            report.println(sentence);
            hits.forEach(hit -> report.println(hit));
            report.println("-----------------------");
        }
    }
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) ScoredItem(zemberek.core.ScoredItem) PrintWriter(java.io.PrintWriter)

Example 14 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class TurkishSpellChecker method suggestForWord.

/**
 * Returns spelling suggestions for {@code word}, ranked by a language model using the
 * neighbouring words as context.
 *
 * <p>If {@code lm} is {@code null} the unranked suggestions are returned as they are. If
 * the model order is below 2, ranking is delegated to {@code suggestForWord(word, lm)}.
 * Otherwise every suggestion is scored against the left and right context: for an order-2
 * model the score is the sum of the (left, word) and (word, right) probabilities, for
 * higher orders it is the (left, word, right) probability. Results are sorted with
 * {@code ScoredItem.STRING_COMP_DESCENDING}.
 *
 * <p>The context words are resolved exactly once, before the scoring loop. A {@code null}
 * context is replaced by the vocabulary's sentence start / sentence end token, which is
 * used verbatim; a non-null context is passed through {@code normalizeForLm} a single
 * time. This guarantees that all candidates are scored against the same context indices.
 *
 * @param word the word to produce suggestions for
 * @param leftContext word preceding {@code word}, or {@code null} for sentence start
 * @param rightContext word following {@code word}, or {@code null} for sentence end
 * @param lm language model used for ranking; may be {@code null}
 * @return suggestions, ranked when a usable language model is given
 */
public List<String> suggestForWord(String word, String leftContext, String rightContext, NgramLanguageModel lm) {
    List<String> unRanked = getUnrankedSuggestions(word);
    if (lm == null) {
        Log.warn("No language model provided. Returning unraked results.");
        return unRanked;
    }
    if (lm.getOrder() < 2) {
        Log.warn("Language model order is 1. For context ranking it should be at least 2. " + "Unigram ranking will be applied.");
        return suggestForWord(word, lm);
    }
    LmVocabulary vocabulary = lm.getVocabulary();

    // Context does not depend on the candidate, so resolve it once. Doing this inside the
    // loop would re-normalize the already resolved value on every further iteration.
    String left = leftContext == null
        ? vocabulary.getSentenceStart()
        : normalizeForLm(leftContext);
    String right = rightContext == null
        ? vocabulary.getSentenceEnd()
        : normalizeForLm(rightContext);
    int leftIndex = vocabulary.indexOf(left);
    int rightIndex = vocabulary.indexOf(right);
    boolean bigramModel = lm.getOrder() == 2;

    List<ScoredItem<String>> results = new ArrayList<>(unRanked.size());
    for (String str : unRanked) {
        int wordIndex = vocabulary.indexOf(normalizeForLm(str));
        float score;
        if (bigramModel) {
            score = lm.getProbability(leftIndex, wordIndex) + lm.getProbability(wordIndex, rightIndex);
        } else {
            score = lm.getProbability(leftIndex, wordIndex, rightIndex);
        }
        results.add(new ScoredItem<>(str, score));
    }
    results.sort(ScoredItem.STRING_COMP_DESCENDING);
    return results.stream().map(s -> s.item).collect(Collectors.toList());
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Resources(com.google.common.io.Resources) Predicate(java.util.function.Predicate) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) WordAnalysisSurfaceFormatter(zemberek.morphology.analysis.WordAnalysisSurfaceFormatter) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) Turkish(zemberek.core.turkish.Turkish) List(java.util.List) Token(zemberek.tokenization.Token) DummyLanguageModel(zemberek.lm.DummyLanguageModel) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) CharMatcher(zemberek.normalization.CharacterGraphDecoder.CharMatcher) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) SmoothLm(zemberek.lm.compression.SmoothLm) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) NgramLanguageModel(zemberek.lm.NgramLanguageModel) LinkedHashSet(java.util.LinkedHashSet) InputStream(java.io.InputStream) LmVocabulary(zemberek.lm.LmVocabulary) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList)

Example 15 with ScoredItem

use of zemberek.core.ScoredItem in project zemberek-nlp by ahmetaa.

the class AutomaticLabelingExperiment method test.

/**
 * Runs the given model over the test lines and writes a human readable comparison of
 * actual and predicted labels.
 *
 * <p>Documents are loaded from {@code corpusPath} into a {@link WebCorpus}. For each test
 * line the document id is taken from the text before the first space, minus its first
 * character. The model is asked for 7 predictions; each is rendered as
 * {@code "label (score)"} with the {@code __label__} prefix removed and underscores
 * turned into spaces. The report lists the id, the document content with runs of line
 * breaks collapsed to a single newline, the document's labels, and the predictions.
 *
 * @param corpusPath file the corpus documents are loaded from
 * @param testData UTF-8 file with one test line per document
 * @param predictionPath report file to write
 * @param fastText model used for prediction
 * @throws IOException if an input cannot be read or the report cannot be opened
 */
private void test(Path corpusPath, Path testData, Path predictionPath, FastText fastText) throws IOException {
    WebCorpus corpus = new WebCorpus("label", "label");
    corpus.addDocuments(WebCorpus.loadDocuments(corpusPath));
    Log.info("Testing started.");

    List<String> testLines = Files.readAllLines(testData, StandardCharsets.UTF_8);
    Stopwatch timer = Stopwatch.createStarted();
    try (PrintWriter out = new PrintWriter(predictionPath.toFile(), "utf-8")) {
        for (String testLine : testLines) {
            // First token of the line, without its leading character, is the document id.
            String firstToken = testLine.substring(0, testLine.indexOf(' '));
            String id = firstToken.substring(1);
            WebDocument doc = corpus.getDocument(id);

            List<String> predictedLabels = new ArrayList<>();
            for (ScoredItem<String> prediction : fastText.predict(testLine, 7)) {
                String readable = prediction.item.replace("__label__", "").replace("_", " ");
                predictedLabels.add(String.format(Locale.ENGLISH, "%s (%.2f)", readable, prediction.score));
            }

            String content = doc.getContentAsString().replaceAll("[\n\r]+", "\n");
            out.println("id = " + id);
            out.println();
            out.println(content);
            out.println();
            out.println("Actual Labels = " + String.join(", ", doc.getLabels()));
            out.println("Predictions   = " + String.join(", ", predictedLabels));
            out.println();
            out.println("------------------------------------------------------");
            out.println();
        }
    }
    Log.info("Done. in %d ms.", timer.elapsed(TimeUnit.MILLISECONDS));
}
Also used : WebDocument(zemberek.corpus.WebDocument) Stopwatch(com.google.common.base.Stopwatch) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) WebCorpus(zemberek.corpus.WebCorpus) PrintWriter(java.io.PrintWriter)

Aggregations

ScoredItem (zemberek.core.ScoredItem)15 ArrayList (java.util.ArrayList)11 PrintWriter (java.io.PrintWriter)6 TurkishMorphology (zemberek.morphology.TurkishMorphology)6 Path (java.nio.file.Path)5 IOException (java.io.IOException)3 LinkedHashSet (java.util.LinkedHashSet)3 List (java.util.List)3 Set (java.util.Set)3 Collectors (java.util.stream.Collectors)3 Test (org.junit.Test)3 FastTextClassifier (zemberek.classification.FastTextClassifier)3 Log (zemberek.core.logging.Log)3 TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet)3 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)3 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)3 Paths (java.nio.file.Paths)2 Scanner (java.util.Scanner)2 IntVector (zemberek.core.collections.IntVector)2 FastText (zemberek.core.embeddings.FastText)2