Search in sources :

Example 1 with SmoothLm

Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.

From the class ConvertToSmoothLmTest, method testConversion:

/**
 * Runs {@code CompressLm} on the ARPA file returned by {@code getTinyArpaFile()}, then checks
 * that the smooth-model output file was created and that the model loaded from it reports
 * an order of 3.
 *
 * @throws IOException propagated from the helper, the conversion or the model loading
 */
@Test
public void testConversion() throws IOException {
    final File tinyArpa = getTinyArpaFile();

    // The output name is made unique per run with the current time; it is removed on JVM exit.
    final File smoothFile = new File(System.currentTimeMillis() + "-test-lm.smooth");
    smoothFile.deleteOnExit();

    final String arpaPath = tinyArpa.getAbsolutePath();
    final String smoothPath = smoothFile.getAbsolutePath();
    new CompressLm().execute(
        "-arpaFile", arpaPath,
        "-smoothFile", smoothPath,
        "-spaceUsage", "16-16-16");

    Assert.assertTrue(smoothFile.exists());

    final SmoothLm loadedModel = SmoothLm.builder(smoothFile).build();
    Assert.assertEquals(3, loadedModel.getOrder());
}
Also used : SmoothLm(zemberek.lm.compression.SmoothLm) File(java.io.File) Test(org.junit.Test)

Example 2 with SmoothLm

Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.

From the class CompressLmTest, method testConversion:

/**
 * Runs {@code CompressLm} with the {@code -in}/{@code -out} options on the ARPA file returned
 * by {@code getTinyArpaFile()}, then checks that the output file was created and that the
 * model loaded from it reports an order of 3.
 *
 * @throws IOException propagated from the helper, the conversion or the model loading
 */
@Test
public void testConversion() throws IOException {
    final File tinyArpa = getTinyArpaFile();

    // The output name is made unique per run with the current time; it is removed on JVM exit.
    final File smoothFile = new File(System.currentTimeMillis() + "-test-lm.smooth");
    smoothFile.deleteOnExit();

    final String inputPath = tinyArpa.getAbsolutePath();
    final String outputPath = smoothFile.getAbsolutePath();
    new CompressLm().execute(
        "-in", inputPath,
        "-out", outputPath,
        "-spaceUsage", "16-16-16");

    Assert.assertTrue(smoothFile.exists());

    final SmoothLm loadedModel = SmoothLm.builder(smoothFile).build();
    Assert.assertEquals(3, loadedModel.getOrder());
}
Also used : SmoothLm(zemberek.lm.compression.SmoothLm) File(java.io.File) Test(org.junit.Test)

Example 3 with SmoothLm

Use of zemberek.lm.compression.SmoothLm in project zemberek-nlp by ahmetaa.

From the class NormalizationScripts, method splitWords:

/**
 * Finds vocabulary words that look like two words written together and writes the best
 * two-word split for each of them.
 *
 * <p>Word counts are read from the {@code incorrect} file under {@code noisyVocab} and the
 * {@code incorrect} file under {@code cleanVocab} and merged into one histogram, which is then
 * pruned with {@code removeSmaller(minWordCount)}. A word is skipped when it is a key of the
 * ASCII map, is shorter than five characters, or contains a hyphen. For the remaining words
 * every split position that leaves at least one character in the head and two in the tail is
 * tried. A split becomes a candidate only when both parts are known to the language model
 * vocabulary and the head-tail bigram exists in the model. The candidate with the highest
 * bigram score is written out if that score is greater than -7.
 *
 * <p>Two files are produced: {@code splitFile} with lines of the form
 * {@code word = head tail}, and a second file whose path is the absolute path of
 * {@code splitFile} with {@code "freq"} appended, where each line also carries the word count.
 *
 * @param noisyVocab directory containing an {@code incorrect} word-count file
 * @param cleanVocab directory containing an {@code incorrect} word-count file
 * @param splitFile output file for the selected splits
 * @param lmPath location of the language model to load
 * @param asciiMapPath text file with {@code key=value} lines; only the keys are used
 * @param morphology analyzer used to protect words ending with "de"/"da"
 * @param minWordCount threshold passed to the histogram pruning step
 * @throws IOException if an input cannot be read or an output cannot be written
 */
static void splitWords(Path noisyVocab, Path cleanVocab, Path splitFile, Path lmPath, Path asciiMapPath, TurkishMorphology morphology, int minWordCount) throws IOException {
    // Lines without '=' (for example a blank line) carry no key. They are skipped because
    // substring(0, -1) would otherwise throw StringIndexOutOfBoundsException and abort the run.
    Set<String> asciiMapKeys = Files.readAllLines(asciiMapPath).stream()
        .filter(line -> line.indexOf('=') >= 0)
        .map(line -> line.substring(0, line.indexOf('=')))
        .collect(Collectors.toSet());

    SmoothLm lm = SmoothLm.builder(lmPath).logBase(Math.E).build();
    Log.info("Language model = %s", lm.info());

    Histogram<String> wordFreq = Histogram.loadFromUtf8File(noisyVocab.resolve("incorrect"), ' ');
    wordFreq.add(Histogram.loadFromUtf8File(cleanVocab.resolve("incorrect"), ' '));
    Log.info("%d words loaded.", wordFreq.size());
    wordFreq.removeSmaller(minWordCount);
    if (minWordCount > 1) {
        Log.info("%d words left after removing counts less than %d.", wordFreq.size(), minWordCount);
    }

    int unkIndex = lm.getVocabulary().getUnknownWordIndex();
    try (PrintWriter pw = new PrintWriter(splitFile.toFile(), "utf-8");
        PrintWriter pwFreq = new PrintWriter(splitFile.toFile().getAbsolutePath() + "freq", "utf-8")) {
        for (String word : wordFreq.getSortedList()) {
            if (asciiMapKeys.contains(word)) {
                continue;
            }
            if (word.length() < 5 || word.contains("-")) {
                continue;
            }

            // Collect every admissible "head tail" split together with its bigram score.
            List<ScoredItem<String>> candidates = new ArrayList<>();
            for (int i = 1; i < word.length() - 1; i++) {
                String head = word.substring(0, i);
                String tail = word.substring(i);
                if (noSplitTails.contains(tail)) {
                    continue;
                }
                int headIndex = lm.getVocabulary().indexOf(head);
                int tailIndex = lm.getVocabulary().indexOf(tail);
                if (headIndex == unkIndex || tailIndex == unkIndex) {
                    continue;
                }
                // NOTE(review): presumably a correct head followed by "de"/"da" is a regular
                // suffixed form rather than two joined words - confirm with the author.
                if ((tail.equals("de") || tail.equals("da")) && morphology.analyze(head).isCorrect()) {
                    continue;
                }
                if (lm.ngramExists(headIndex, tailIndex)) {
                    candidates.add(new ScoredItem<>(head + " " + tail, lm.getProbability(headIndex, tailIndex)));
                }
            }
            if (candidates.isEmpty()) {
                continue;
            }

            // Highest score first; the stable sort keeps the earliest split among equal scores.
            if (candidates.size() > 1) {
                candidates.sort((a, b) -> Double.compare(b.score, a.score));
            }
            ScoredItem<String> best = candidates.get(0);
            // NOTE(review): the model is built with logBase(Math.E), so -7 is presumably a
            // natural-log probability cut-off - confirm.
            if (best.score > -7) {
                pw.println(word + " = " + best.item);
                pwFreq.println(word + " = " + best.item + " " + wordFreq.getCount(word));
            }
        }
    }
}
Also used : TurkishMorphotactics(zemberek.morphology.morphotactics.TurkishMorphotactics) TextUtil(zemberek.core.text.TextUtil) Callable(java.util.concurrent.Callable) CompletionService(java.util.concurrent.CompletionService) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BlockingExecutor(zemberek.core.concurrency.BlockingExecutor) Token(zemberek.tokenization.Token) HashMultimap(com.google.common.collect.HashMultimap) Charset(java.nio.charset.Charset) SingleAnalysis(zemberek.morphology.analysis.SingleAnalysis) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) AnalysisCache(zemberek.morphology.analysis.AnalysisCache) Log(zemberek.core.logging.Log) TextChunk(zemberek.core.text.TextChunk) Splitter(com.google.common.base.Splitter) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) ExecutorService(java.util.concurrent.ExecutorService) Histogram(zemberek.core.collections.Histogram) SecondaryPos(zemberek.core.turkish.SecondaryPos) PrintWriter(java.io.PrintWriter) Charsets(com.google.common.base.Charsets) Files(java.nio.file.Files) TurkishMorphology(zemberek.morphology.TurkishMorphology) Set(java.util.Set) IOException(java.io.IOException) Deasciifier(zemberek.normalization.deasciifier.Deasciifier) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Paths(java.nio.file.Paths) TextIO(zemberek.core.text.TextIO) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) LanguageIdentifier(zemberek.langid.LanguageIdentifier) SmoothLm(zemberek.lm.compression.SmoothLm) FixedBitVector(zemberek.core.collections.FixedBitVector) ScoredItem(zemberek.core.ScoredItem) RootLexicon(zemberek.morphology.lexicon.RootLexicon) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) BlockTextLoader(zemberek.core.text.BlockTextLoader) ScoredItem(zemberek.core.ScoredItem) 
ArrayList(java.util.ArrayList) SmoothLm(zemberek.lm.compression.SmoothLm) PrintWriter(java.io.PrintWriter)

Aggregations

SmoothLm (zemberek.lm.compression.SmoothLm)3 File (java.io.File)2 Test (org.junit.Test)2 Charsets (com.google.common.base.Charsets)1 Splitter (com.google.common.base.Splitter)1 HashMultimap (com.google.common.collect.HashMultimap)1 IOException (java.io.IOException)1 PrintWriter (java.io.PrintWriter)1 Charset (java.nio.charset.Charset)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Files (java.nio.file.Files)1 Path (java.nio.file.Path)1 Paths (java.nio.file.Paths)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 List (java.util.List)1 Set (java.util.Set)1 Callable (java.util.concurrent.Callable)1 CompletionService (java.util.concurrent.CompletionService)1