Search in sources :

Example 16 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class CompressedCharNgramModel method compress.

public static void compress(MapBasedCharNgramLanguageModel model, File output) throws IOException {
    Mphf[] mphfs = new MultiLevelMphf[model.getOrder() + 1];
    DoubleLookup[] lookups = new DoubleLookup[model.getOrder() + 1];
    try (DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(output)))) {
        for (int i = 1; i <= model.getOrder(); i++) {
            Histogram<Double> histogram = new Histogram<>();
            double[] lookup = new double[histogram.size()];
            int j = 0;
            for (Double key : histogram) {
                lookup[j] = key;
            Quantizer quantizer = BinningQuantizer.linearBinning(lookup, 8);
            lookups[i] = quantizer.getDequantizer();
            List<String> keys = Lists.newArrayList(model.gramLogProbs[i].values.keySet());
            int[] fingerprints = new int[keys.size()];
            int[] probabilityIndexes = new int[keys.size()];
            mphfs[i] = MultiLevelMphf.generate(new StringListKeyProvider(keys));
            for (final String key : keys) {
                final int index = mphfs[i].get(key);
                fingerprints[index] = MultiLevelMphf.hash(key, -1) & FINGER_PRINT_MASK;
                probabilityIndexes[index] = quantizer.getQuantizationIndex(model.gramLogProbs[i].values.get(key));
            for (int k = 0; k < keys.size(); k++) {
                dos.writeShort(fingerprints[k] & 0xffff);
Also used : Histogram(zemberek.core.collections.Histogram) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) Mphf(zemberek.core.hash.Mphf) DataOutputStream(java.io.DataOutputStream) MultiLevelMphf(zemberek.core.hash.MultiLevelMphf) FileOutputStream(java.io.FileOutputStream) Quantizer(zemberek.core.quantization.Quantizer) BinningQuantizer(zemberek.core.quantization.BinningQuantizer) DoubleLookup(zemberek.core.quantization.DoubleLookup) BufferedOutputStream(java.io.BufferedOutputStream)

Example 17 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class ConfusionTest method testContains.

public void testContains() throws IOException {
    int sliceLength = 1000;
    int maxSliceCount = 1000;
    List<TestSet> sets = allSets(maxSliceCount, sliceLength);
    Set<String> languages = identifier.getLanguages();
    for (String language : languages) {
        Stopwatch sw = Stopwatch.createStarted();
        int falsePositives = 0;
        int totalCount = 0;
        int correctlyFound = 0;
        int correctAmount = 0;
        for (TestSet set : sets) {
            /*                if(!set.modelId.equals("tr"))
            totalCount += set.size();
            Histogram<String> result = new Histogram<>();
            for (String s : set.testPieces) {
                    LanguageIdentifier.IdResult idResult = identifier.identifyFullConf(s);
                // String t = identifier.identify(s, 100);
                // String t = identifier.identify(s);
                String t = "tr";
                identifier.containsLanguage(s, "tr", 100, -1);
                if (set.modelId.equals(language) && !t.equals(language)) {
                /* if (identifier.containsLanguage(s, "tr", 100, -1))
                            System.out.println("Has tr slice!");
                        System.out.println(t + " " + s);*/
            // result.add(identifier.identifyWithSampling(s,sliceLength));
            // result.add(identifier.identifyWithSampling(s, 4));
            if (set.modelId.equals(language)) {
                System.out.println("Lang test size:" + set.size());
                correctlyFound = result.getCount(language);
                correctAmount = set.size();
                List<String> sorted = result.getSortedList();
                for (String s : sorted) {
                    System.out.println(s + " : " + result.getCount(s));
            } else {
                int fpcount = result.getCount(language);
                if (fpcount > 0) {
                    System.out.println(set.modelId + " " + fpcount);
            falsePositives += result.getCount(language);
        double elapsed = sw.elapsed(TimeUnit.MILLISECONDS);
        System.out.println(String.format(Locale.ENGLISH, "Id per second: %.2f", (1000d * totalCount / elapsed)));
        System.out.println("False positive count: " + falsePositives);
        System.out.println("All: " + totalCount);
        System.out.println(String.format(Locale.ENGLISH, "Precision:%.2f ", (100d * correctlyFound / correctAmount)));
        System.out.println(String.format(Locale.ENGLISH, "Recall: %.2f", (100d * (totalCount - falsePositives) / totalCount)));
Also used : Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch)

Example 18 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class SpellCheckerPerformanceTests method correctWordFindingTest.

@Ignore(value = "Not a test.")
public void correctWordFindingTest() throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
    TurkishSentenceExtractor extractor = TurkishSentenceExtractor.DEFAULT;
    TurkishTokenizer tokenizer = TurkishTokenizer.DEFAULT;
    Path path = new File(Resources.getResource("spell-checker-test.txt").getFile()).toPath();
    List<String> lines = Files.readAllLines(path);
    List<String> sentences = extractor.fromParagraphs(lines);
    Stopwatch sw = Stopwatch.createStarted();
    Histogram<String> incorrectFound = new Histogram<>();
    Histogram<String> correctFound = new Histogram<>();
    for (String sentence : sentences) {
        List<Token> tokens = tokenizer.tokenize(sentence);
        for (Token token : tokens) {
            String text = token.getText();
            if (!spellChecker.check(text)) {
            } else {
    }"Elapsed = %d", sw.elapsed(TimeUnit.MILLISECONDS));"Incorrect (total/unique) = %d / %d", incorrectFound.totalCount(), incorrectFound.size());"Correct (total/unique) = %d / %d", correctFound.totalCount(), correctFound.size());
    incorrectFound.saveSortedByCounts(Paths.get("incorrect.txt"), " : ");
    correctFound.saveSortedByCounts(Paths.get("correct.txt"), " : ");
        Path lmPath = Paths.get(ClassLoader.getSystemResource("lm-bigram.slm").toURI());
        SmoothLm model = SmoothLm.builder(lmPath.toFile()).build();
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(zemberek.tokenization.Token) TurkishMorphology(zemberek.morphology.TurkishMorphology) TurkishSentenceExtractor(zemberek.tokenization.TurkishSentenceExtractor) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 19 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method ambiguousWordStats.

public void ambiguousWordStats(String filename) throws IOException {
    List<String> lines = readAll(filename);
    Histogram<String> uniques = new Histogram<>(1000000);
    int total = 0;
    Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    for (String line : lines) {
        for (String s : splitter.split(line)) {
            List<WordAnalysis> results = parser.getWordAnalyzer().analyze(TurkishAlphabet.INSTANCE.normalize(s));
            if (total % 50000 == 0) {
                System.out.println("Processed: " + total);
            if (results.size() > 1) {
    System.out.println("Total: " + total);
    Stats st = new Stats(0.002);
    st.allCounts = (int) uniques.totalCount();
    st.allUniques = uniques.size();
    for (String s : uniques.getSortedList()) {
        int count = uniques.getCount(s);
        if (st.overCutoff(count)) {
            String p1 = percentStr3(count, st.allCounts);
            st.significantCounts += count;
            System.out.println(s + " : " + count + "    " + pp(p1));
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 20 with Histogram

use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

the class AmbiguityStats method noParse.

public void noParse(String... filename) throws IOException {
    Histogram<String> uniques = new Histogram<>(1000000);
    int total = 0;
    for (String file : filename) {
        List<String> lines = readAll(file);
        Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
        for (String line : lines) {
            for (String s : splitter.split(line)) {
                List<WordAnalysis> results = parser.getWordAnalyzer().analyze(TurkishAlphabet.INSTANCE.normalize(s));
                if (total % 50000 == 0) {
                    System.out.println("Processed: " + total);
                if (results.size() == 0) {
        System.out.println("Total: " + total);
    Stats st = new Stats(0.0002);
    st.allCounts = (int) uniques.totalCount();
    st.allUniques = uniques.size();
    for (String s : uniques.getSortedList()) {
        int count = uniques.getCount(s);
        if (count > 5) {
            st.significantCounts += count;
            System.out.println(s + " : " + count);
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)


Histogram (zemberek.core.collections.Histogram)39 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)17 Path (java.nio.file.Path)15 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)14 ArrayList (java.util.ArrayList)13 TurkishMorphology (zemberek.morphology.TurkishMorphology)12 Token (zemberek.tokenization.Token)12 Stopwatch (com.google.common.base.Stopwatch) PrintWriter (java.io.PrintWriter) LinkedHashSet (java.util.LinkedHashSet)11 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)11 IOException (java.io.IOException) Files (java.nio.file.Files)10 Paths (java.nio.file.Paths)10 List (java.util.List)10 Collectors (java.util.stream.Collectors) TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)10 StandardCharsets (java.nio.charset.StandardCharsets)9 HashSet (java.util.HashSet)9 Log (zemberek.core.logging.Log)9