Search in sources :

Example 16 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class CompressedCharNgramModel method compress.

/**
 * Writes a compressed representation of a character n-gram model to a file.
 *
 * <p>The output starts with the model order (int) and the model id (UTF string). Then,
 * for each order from 1 up to the model order, the following is written in sequence:
 * the dequantization lookup, the number of grams (int), one record per hash slot made
 * of a two-byte fingerprint and a one-byte quantized probability index, and finally the
 * serialized hash function used to map grams to slots.
 *
 * @param model  model whose per-order gram log probabilities are compressed
 * @param output destination file
 * @throws IOException if writing to {@code output} fails
 */
public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
    final int order = model.getOrder();
    Mphf[] hashes = new MultiLevelMphf[order + 1];
    DoubleLookup[] dequantizers = new DoubleLookup[order + 1];
    try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)))) {
        out.writeInt(order);
        out.writeUTF(model.getId());
        for (int level = 1; level <= order; level++) {
            // Gather the distinct log-probability values seen at this order.
            Histogram<Double> distinctValues = new Histogram<>();
            distinctValues.add(model.gramLogProbs[level].values.values());
            double[] uniqueProbs = new double[distinctValues.size()];
            int cursor = 0;
            for (Double value : distinctValues) {
                uniqueProbs[cursor++] = value;
            }

            // Build the quantizer over the distinct values and keep its dequantizer.
            Quantizer quantizer = BinningQuantizer.linearBinning(uniqueProbs, 8);
            DoubleLookup dequantizer = quantizer.getDequantizer();
            dequantizers[level] = dequantizer;

            List<String> grams = Lists.newArrayList(model.gramLogProbs[level].values.keySet());
            final int gramCount = grams.size();
            int[] fingerprintBySlot = new int[gramCount];
            int[] probIndexBySlot = new int[gramCount];
            Mphf hash = MultiLevelMphf.generate(new StringListKeyProvider(grams));
            hashes[level] = hash;

            // Place each gram's fingerprint and quantized probability at its hash slot.
            for (final String gram : grams) {
                final int slot = hash.get(gram);
                fingerprintBySlot[slot] = MultiLevelMphf.hash(gram, -1) & FINGER_PRINT_MASK;
                probIndexBySlot[slot] =
                    quantizer.getQuantizationIndex(model.gramLogProbs[level].values.get(gram));
            }

            dequantizer.save(out);
            out.writeInt(gramCount);
            for (int slot = 0; slot < gramCount; slot++) {
                out.writeShort(fingerprintBySlot[slot] & 0xffff);
                out.writeByte(probIndexBySlot[slot]);
            }
            hash.serialize(out);
        }
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) Mphf(zemberek.core.hash.Mphf) DataOutputStream(java.io.DataOutputStream) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) FileOutputStream(java.io.FileOutputStream) Quantizer(zemberek.core.quantization.Quantizer) BinningQuantizer(zemberek.core.quantization.BinningQuantizer) DoubleLookup(zemberek.core.quantization.DoubleLookup) BufferedOutputStream(java.io.BufferedOutputStream)

Example 17 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class ConfusionTest method testContains.

/**
 * Runs {@code identifier.containsLanguage} over every test piece of every test set,
 * once per language known to the identifier, and prints throughput and count
 * statistics to standard output.
 *
 * <p>The predicted id recorded for each piece is currently pinned to {@code "tr"};
 * the {@code containsLanguage} call is executed but its return value is not used.
 *
 * @throws IOException declared by the original signature; presumably raised while
 *     loading the test sets (confirm against {@code allSets})
 */
public void testContains() throws IOException {
    final int sliceLength = 1000;
    final int maxSliceCount = 1000;
    List<TestSet> sets = allSets(maxSliceCount, sliceLength);
    for (String language : identifier.getLanguages()) {
        System.out.println(language);
        Stopwatch sw = Stopwatch.createStarted();
        int falsePositives = 0;
        int totalCount = 0;
        int correctlyFound = 0;
        int correctAmount = 0;
        for (TestSet set : sets) {
            totalCount += set.size();
            Histogram<String> result = new Histogram<>();
            for (String piece : set.testPieces) {
                // Alternative identification calls (identify, identifyFullConf,
                // identifyWithSampling) were tried here and are disabled.
                String predicted = "tr";
                identifier.containsLanguage(piece, "tr", 100, -1);
                if (set.modelId.equals(language) && !predicted.equals(language)) {
                    // Misclassification diagnostics are disabled.
                }
                result.add(predicted);
            }

            if (set.modelId.equals(language)) {
                // This set belongs to the language under test: record its hit count
                // and print the distribution of recorded ids.
                System.out.println("Lang test size:" + set.size());
                correctlyFound = result.getCount(language);
                correctAmount = set.size();
                for (String id : result.getSortedList()) {
                    System.out.println(id + " : " + result.getCount(id));
                }
                continue;
            }

            // Any hit for this language in another language's set is a false positive.
            int fpCount = result.getCount(language);
            if (fpCount > 0) {
                System.out.println(set.modelId + " " + fpCount);
            }
            falsePositives += fpCount;
        }
        double elapsed = sw.elapsed(TimeUnit.MILLISECONDS);
        double idsPerSecond = 1000d * totalCount / elapsed;
        double precision = 100d * correctlyFound / correctAmount;
        double recall = 100d * (totalCount - falsePositives) / totalCount;
        System.out.println(String.format(Locale.ENGLISH, "Id per second: %.2f", idsPerSecond));
        System.out.println("False positive count: " + falsePositives);
        System.out.println("All: " + totalCount);
        System.out.println(String.format(Locale.ENGLISH, "Precision:%.2f ", precision));
        System.out.println(String.format(Locale.ENGLISH, "Recall: %.2f", recall));
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch)

Example 18 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class SpellCheckerPerformanceTests method correctWordFindingTest.

/**
 * Manual performance run for the spell checker (ignored as an automated test).
 *
 * <p>Reads {@code spell-checker-test.txt} from the classpath, splits it into sentences
 * and tokens, runs {@code TurkishSpellChecker.check} on every token text, logs the
 * elapsed time and the total/unique counts of accepted and rejected tokens, and saves
 * both histograms sorted by count to {@code correct.txt} and {@code incorrect.txt}.
 *
 * @throws Exception if the resource cannot be resolved or read, or the output files
 *     cannot be written
 */
@Test
@Ignore(value = "Not a test.")
public void correctWordFindingTest() throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
    // Resolve the resource through its URI. URL.getFile() returns the raw,
    // percent-encoded path, so building a File from it fails when the classpath
    // location contains spaces or other escaped characters.
    Path path = Paths.get(Resources.getResource("spell-checker-test.txt").toURI());
    List<String> lines = Files.readAllLines(path);
    List<String> sentences = extractor.fromParagraphs(lines);
    Stopwatch sw = Stopwatch.createStarted();
    Histogram<String> incorrectFound = new Histogram<>();
    Histogram<String> correctFound = new Histogram<>();
    for (String sentence : sentences) {
        List<Token> tokens = tokenizer.tokenize(sentence);
        for (Token token : tokens) {
            String text = token.getText();
            if (spellChecker.check(text)) {
                correctFound.add(text);
            } else {
                incorrectFound.add(text);
            }
        }
    }
    Log.info("Elapsed = %d", sw.elapsed(TimeUnit.MILLISECONDS));
    Log.info("Incorrect (total/unique) = %d / %d", incorrectFound.totalCount(), incorrectFound.size());
    Log.info("Correct (total/unique) = %d / %d", correctFound.totalCount(), correctFound.size());
    incorrectFound.saveSortedByCounts(Paths.get("incorrect.txt"), " : ");
    correctFound.saveSortedByCounts(Paths.get("correct.txt"), " : ");
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) TurkishMorphology(zemberek.morphology.TurkishMorphology) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 19 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method ambiguousWordStats.

/**
 * Prints statistics about tokens that receive more than one morphological analysis.
 *
 * <p>Every line of the file is split on spaces (empty pieces dropped, pieces trimmed).
 * Each token is normalized and analyzed; tokens with more than one analysis result are
 * counted in a histogram under their original, un-normalized form. A progress line is
 * printed every 50000 tokens. Finally, each counted token that passes
 * {@code Stats.overCutoff} is printed with its count and share of all counts, and the
 * collected statistics are dumped.
 *
 * @param filename input passed to {@code readAll}
 * @throws IOException if reading the input fails
 */
public void ambiguousWordStats(String filename) throws IOException {
    List<String> lines = readAll(filename);
    Histogram<String> ambiguous = new Histogram<>(1000000);
    int processed = 0;
    Splitter tokenSplitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    for (String line : lines) {
        for (String token : tokenSplitter.split(line)) {
            String normalized = TurkishAlphabet.INSTANCE.normalize(token);
            List<WordAnalysis> analyses = parser.getWordAnalyzer().analyze(normalized);
            processed++;
            if (processed % 50000 == 0) {
                System.out.println("Processed: " + processed);
            }
            if (analyses.size() > 1) {
                ambiguous.add(token);
            }
        }
    }
    System.out.println("Total: " + processed);

    // NOTE(review): 0.002 is presumably the cutoff used by overCutoff - confirm in Stats.
    Stats stats = new Stats(0.002);
    stats.allCounts = (int) ambiguous.totalCount();
    stats.allUniques = ambiguous.size();
    for (String token : ambiguous.getSortedList()) {
        int count = ambiguous.getCount(token);
        if (!stats.overCutoff(count)) {
            continue;
        }
        String share = percentStr3(count, stats.allCounts);
        stats.significantCounts += count;
        stats.significantUniques++;
        System.out.println(token + " : " + count + "    " + pp(share));
    }
    stats.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 20 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method noParse.

/**
 * Prints statistics about tokens for which the word analyzer returns no result.
 *
 * <p>Each given file is read and split on spaces (empty pieces dropped, pieces
 * trimmed). Every token is normalized and analyzed; tokens with zero analysis results
 * are counted in a histogram under their original, un-normalized form. A progress line
 * is printed every 50000 tokens and a running total after each file. Finally, tokens
 * counted more than five times are printed and the collected statistics are dumped.
 *
 * @param filename inputs passed one by one to {@code readAll}
 * @throws IOException if reading an input fails
 */
public void noParse(String... filename) throws IOException {
    Histogram<String> unparsed = new Histogram<>(1000000);
    Splitter tokenSplitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    int processed = 0;
    for (String file : filename) {
        for (String line : readAll(file)) {
            for (String token : tokenSplitter.split(line)) {
                String normalized = TurkishAlphabet.INSTANCE.normalize(token);
                List<WordAnalysis> analyses = parser.getWordAnalyzer().analyze(normalized);
                processed++;
                if (processed % 50000 == 0) {
                    System.out.println("Processed: " + processed);
                }
                if (analyses.isEmpty()) {
                    unparsed.add(token);
                }
            }
        }
        System.out.println("Total: " + processed);
    }

    Stats stats = new Stats(0.0002);
    stats.allCounts = (int) unparsed.totalCount();
    stats.allUniques = unparsed.size();
    for (String token : unparsed.getSortedList()) {
        int count = unparsed.getCount(token);
        if (count <= 5) {
            continue;
        }
        stats.significantCounts += count;
        stats.significantUniques++;
        System.out.println(token + " : " + count);
    }
    stats.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Aggregations

Histogram (zemberek.core.collections.Histogram) 39, WordAnalysis (zemberek.morphology.analysis.WordAnalysis) 17, Path (java.nio.file.Path) 15, SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis) 14, ArrayList (java.util.ArrayList) 13, TurkishMorphology (zemberek.morphology.TurkishMorphology) 12, Token (zemberek.tokenization.Token) 12, Stopwatch (com.google.common.base.Stopwatch) 11, PrintWriter (java.io.PrintWriter) 11, LinkedHashSet (java.util.LinkedHashSet) 11, SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis) 11, IOException (java.io.IOException) 10, Files (java.nio.file.Files) 10, Paths (java.nio.file.Paths) 10, List (java.util.List) 10, Collectors (java.util.stream.Collectors) 10, TurkishTokenizer (zemberek.tokenization.TurkishTokenizer) 10, StandardCharsets (java.nio.charset.StandardCharsets) 9, HashSet (java.util.HashSet) 9, Log (zemberek.core.logging.Log) 9