Search in sources:

Example 21 with Histogram

Use of zemberek.core.collections.Histogram in the project zemberek-nlp by ahmetaa.

From the class SpeedTest, the method testNewsCorpusNoCache:

@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpusNoCache() throws IOException {
    // Measures raw morphological-analysis throughput over a 10k-sentence news
    // corpus and records every token the analyzer fails to parse.
    Path p = Paths.get("src/main/resources/corpora/cnn-turk-10k");
    List<String> sentences = getSentences(p);
    RootLexicon lexicon = TurkishDictionaryLoader.loadDefaultDictionaries();
    InterpretingAnalyzer analyzer = new InterpretingAnalyzer(lexicon);
    Stopwatch sw = Stopwatch.createStarted();
    int tokenCount = 0;
    int noAnalysis = 0;
    int sentenceCount = 0;
    // Unparseable words are collected with frequencies for later inspection.
    Histogram<String> failedWords = new Histogram<>(100000);
    for (String sentence : sentences) {
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        for (Token token : tokens) {
            // Punctuation is excluded from the token count and the speed figure.
            if (token.getType() == TurkishLexer.Punctuation) {
                continue;
            }
            tokenCount++;
            List<_SingleAnalysis> results = analyzer.analyze(token.getText());
            if (results.isEmpty()) { // idiomatic replacement for size() == 0
                noAnalysis++;
                failedWords.add(token.getText());
            }
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d tokens analyzed.", tokenCount);
        }
    }
    double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    // Guard against a degenerate empty corpus: the original divided by
    // tokenCount unconditionally, producing NaN/Infinity in the report.
    double speed = tokenCount == 0 ? 0 : tokenCount / seconds;
    double parseRatio = tokenCount == 0 ? 0 : 100 - (noAnalysis * 100d / tokenCount);
    Log.info("%nElapsed = %.2f seconds", seconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", tokenCount, parseRatio, speed);
    Log.info("Saving Unknown Tokens");
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used : Path(java.nio.file.Path) Histogram(zemberek.core.collections.Histogram) Stopwatch(com.google.common.base.Stopwatch) Token(org.antlr.v4.runtime.Token) RootLexicon(zemberek.morphology.lexicon.RootLexicon) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 22 with Histogram

Use of zemberek.core.collections.Histogram in the project zemberek-nlp by ahmetaa.

From the class AutomaticLabelingExperiment, the method generateSetForLabelExperiment:

/**
 * Builds a fastText-style labeled data set from a web corpus.
 * <p>
 * Labels occurring fewer than 15 times are dropped; duplicate documents
 * (matched by content hash) and documents with less than 200 characters of
 * processed content are skipped. Each output line has the form
 * {@code #<id> __label__<l1> ... <processed content>}.
 *
 * @param input    path of the labeled WebCorpus documents file
 * @param analyzer sentence analyzer used by {@code processContent}
 * @param useRoots whether {@code processContent} reduces words to roots
 * @return shuffled, de-duplicated set of training lines
 * @throws IOException if the corpus cannot be read
 */
Set<String> generateSetForLabelExperiment(Path input, TurkishSentenceAnalyzer analyzer, boolean useRoots) throws IOException {
    WebCorpus corpus = new WebCorpus("label", "labeled");
    corpus.addDocuments(WebCorpus.loadDocuments(input));
    List<String> set = new ArrayList<>(corpus.documentCount());
    Log.info("Extracting data.");
    // First pass: count label frequencies so rare labels can be pruned.
    Histogram<String> labelCounts = new Histogram<>();
    for (WebDocument document : corpus.getDocuments()) {
        List<String> labels = document.getLabels();
        List<String> lowerCase = labels.stream().filter(s -> s.length() > 1).map(s -> s.toLowerCase(Turkish.LOCALE)).collect(Collectors.toList());
        labelCounts.add(lowerCase);
    }
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-all"), " ");
    Log.info("All label count = %d", labelCounts.size());
    labelCounts.removeSmaller(15);
    Log.info("Reduced label count = %d", labelCounts.size());
    labelCounts.saveSortedByCounts(experimentRoot.resolve("labels-reduced"), " ");
    Log.info("Extracting data from %d documents ", corpus.documentCount());
    int c = 0;
    Set<Long> contentHash = new HashSet<>();
    // Second pass: emit one training line per unique, sufficiently-long document.
    for (WebDocument document : corpus.getDocuments()) {
        Long hash = document.getHash();
        if (contentHash.contains(hash)) {
            continue;
        }
        contentHash.add(hash);
        List<String> labelTags = new ArrayList<>();
        boolean labelFound = false;
        for (String label : document.getLabels()) {
            if (labelCounts.contains(label)) {
                labelTags.add("__label__" + label.replaceAll("[ ]+", "_").toLowerCase(Turkish.LOCALE));
                labelFound = true;
            }
        }
        if (!labelFound) {
            continue;
        }
        String labelStr = String.join(" ", labelTags);
        String content = document.getContentAsString();
        String processed = processContent(analyzer, content, useRoots);
        if (processed.length() < 200) {
            continue;
        }
        set.add("#" + document.getId() + " " + labelStr + " " + processed);
        c++;
        // Report progress every 1000 kept documents. The original
        // "if (c++ % 1000 == 0)" fired on the very first document and
        // logged the post-increment value, so counts were off by one.
        if (c % 1000 == 0) {
            Log.info("%d processed.", c);
        }
    }
    Log.info("Generate train and test set.");
    // Fixed seed keeps the shuffle reproducible; LinkedHashSet preserves it.
    Collections.shuffle(set, new Random(1));
    return new LinkedHashSet<>(set);
}
Also used : TurkishSentenceAnalyzer(zemberek.morphology.analysis.tr.TurkishSentenceAnalyzer) Stopwatch(com.google.common.base.Stopwatch) WebCorpus(zemberek.corpus.WebCorpus) Token(org.antlr.v4.runtime.Token) Random(java.util.Random) SentenceAnalysis(zemberek.morphology.analysis.SentenceAnalysis) WebDocument(zemberek.corpus.WebDocument) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) Log(zemberek.core.logging.Log) Path(java.nio.file.Path) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) PrintWriter(java.io.PrintWriter) Files(java.nio.file.Files) Z3MarkovModelDisambiguator(zemberek.morphology.ambiguity.Z3MarkovModelDisambiguator) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) Paths(java.nio.file.Paths) ScoredItem(zemberek.core.ScoredItem) Comparator(java.util.Comparator) Collections(java.util.Collections) LinkedHashSet(java.util.LinkedHashSet) Histogram(zemberek.core.collections.Histogram) ArrayList(java.util.ArrayList) WebDocument(zemberek.corpus.WebDocument) Random(java.util.Random) WebCorpus(zemberek.corpus.WebCorpus) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 23 with Histogram

Use of zemberek.core.collections.Histogram in the project zemberek-nlp by ahmetaa.

From the class GenerateVocabulary, the method run:

@Override
public void run() throws Exception {
    // Builds a vocabulary file from a corpus: counts word frequencies,
    // applies min-frequency / top-N cutoffs, merges include/exclude word
    // lists, optionally sorts with a locale collator, and writes the result
    // (with or without frequency values).
    if (!corpus.exists()) {
        throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
    }
    if (top < -1 || top == 0) {
        throw new IllegalArgumentException("Illegal value for -top: " + top);
    }
    Set<String> wordsToInclude = getWordsFromFile(includeFile);
    if (!wordsToInclude.isEmpty()) {
        Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
    }
    Set<String> wordsToExclude = getWordsFromFile(excludeFile);
    if (!wordsToExclude.isEmpty()) {
        Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
    }
    // Words listed in both files are contradictory; warn but continue
    // (exclusion wins below because removeAll runs after addAll).
    Set<String> intersection = Sets.newHashSet(wordsToExclude);
    intersection.retainAll(wordsToInclude);
    if (!intersection.isEmpty()) {
        Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
    }
    Collator collator = Collator.getInstance(Locale.ENGLISH);
    if (sortLocale != null) {
        collator = Collator.getInstance(new Locale(sortLocale));
    }
    Log.info("Processing corpus: %s", corpus);
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
        String line;
        Histogram<String> histogram = new Histogram<>(50000);
        int count = 0;
        while ((line = reader.readLine()) != null) {
            List<String> words = new ArrayList<>(10);
            for (String word : Splitter.on(" ").omitEmptyStrings().trimResults().split(line)) {
                if (word.length() > 30) {
                    Log.warn("Too long word %s", word);
                }
                if (!countMetaWords) {
                    // NOTE(review): the filter is asymmetric — contains("<") but
                    // equalsIgnoreCase(">"). Tokens such as "a>b" slip through;
                    // possibly contains(">") was intended. Behavior kept as-is.
                    if (word.contains("<") || word.equalsIgnoreCase(">")) {
                        continue;
                    }
                }
                words.add(word);
            }
            if (words.isEmpty()) {
                continue;
            }
            histogram.add(words);
            if (count % 500000 == 0 && count != 0) {
                Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
            }
            count++;
        }
        Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
        if (minFreq > 1) {
            histogram.removeSmaller(minFreq);
        }
        // top == -1 (or larger than the vocabulary) means "take everything".
        if (top >= histogram.size() || top == -1) {
            top = histogram.size();
            Log.info("All %d words will be in the vocabulary.", top);
        } else {
            Log.info("Top %d words will be used in the vocabulary.", top);
        }
        List<String> mostFrequent;
        if (top > 0) {
            mostFrequent = histogram.getTop(top);
        } else {
            mostFrequent = histogram.getSortedList();
        }
        Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
        // LinkedHashSet keeps frequency order while de-duplicating with the
        // include list; exclusions are applied last so they always win.
        LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
        resultSet.addAll(wordsToInclude);
        resultSet.removeAll(wordsToExclude);
        List<String> result = Lists.newArrayList(resultSet);
        Log.info("Total size of vocabulary: %d", result.size());
        if (ordered) {
            Log.info("Sorting file with word order.");
            Collections.sort(result, collator);
        }
        com.google.common.io.Files.createParentDirs(outFile);
        Log.info("Saving to vocabulary file: %s", outFile);
        if (!writeFrequencies) {
            SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
        } else {
            Log.info("Frequency values will be written with words.");
            try (SimpleTextWriter stw = SimpleTextWriter.keepOpenUTF8Writer(outFile)) {
                for (String s : result) {
                    stw.writeLine(s + frequencyFileDelimiter + histogram.getCount(s));
                }
            }
        }
        Log.info("Done.");
    }
}
Also used : Histogram(zemberek.core.collections.Histogram) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) Collator(java.text.Collator) SimpleTextWriter(zemberek.core.io.SimpleTextWriter) BufferedReader(java.io.BufferedReader)

Example 24 with Histogram

Use of zemberek.core.collections.Histogram in the project zemberek-nlp by ahmetaa.

From the class AmbiguityStats, the method ambiguousGroupStats:

/**
 * Collects statistics about morphologically ambiguous words in a corpus file.
 * <p>
 * Every word with more than one analysis is bucketed under a key derived from
 * its parse ({@code generateKeyFromParse}); per-key member frequencies are
 * then printed for groups above the significance cutoff.
 *
 * @param filename corpus file to read, one sentence per line
 * @throws IOException if the file cannot be read
 */
public void ambiguousGroupStats(String filename) throws IOException {
    List<String> lines = readAll(filename);
    Histogram<String> uniques = new Histogram<>(1000000);
    Map<String, Histogram<String>> ambiguityGroups = Maps.newHashMap();
    int total = 0;
    for (String line : lines) {
        for (String s : splitter.split(line)) {
            WordAnalysis results = parser.analyze(s);
            if (++total % 50000 == 0) {
                System.out.println("Processed: " + total);
            }
            if (results.analysisCount() > 1) {
                String key = generateKeyFromParse(results);
                uniques.add(key);
                // computeIfAbsent replaces the original get/null-check/put dance.
                ambiguityGroups.computeIfAbsent(key, k -> new Histogram<>()).add(s);
            }
        }
    }
    System.out.println("Total: " + total);
    Stats st = new Stats(0.1);
    st.allCounts = (int) uniques.totalCount();
    st.allUniques = uniques.size();
    for (String s : uniques.getSortedList()) {
        int count = uniques.getCount(s);
        if (st.overCutoff(count)) {
            String p1 = percentStr(count, st.allCounts);
            st.significantCounts += count;
            st.significantUniques++;
            System.out.println(s + " : " + count + "    " + pp(p1));
            Histogram<String> members = ambiguityGroups.get(s);
            for (String member : members.getSortedList()) {
                int memberCount = members.getCount(member);
                // Only print members contributing more than 10% of the group.
                if (pct(memberCount, count) > 0.1) {
                    System.out.println(member + " : " + members.getCount(member));
                }
            }
            System.out.println();
        }
    }
    st.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Example 25 with Histogram

Use of zemberek.core.collections.Histogram in the project zemberek-nlp by ahmetaa.

From the class AmbiguityStats, the method noParse:

/**
 * Collects and prints the words that the analyzer cannot parse at all,
 * aggregated over one or more corpus files.
 *
 * @param filename one or more corpus files, one sentence per line
 * @throws IOException if a file cannot be read
 */
public void noParse(String... filename) throws IOException {
    Histogram<String> uniques = new Histogram<>(1000000);
    int total = 0;
    // Splitter configuration is loop-invariant; the original rebuilt it
    // for every input file.
    Splitter splitter = Splitter.on(" ").omitEmptyStrings().trimResults();
    for (String file : filename) {
        List<String> lines = readAll(file);
        for (String line : lines) {
            for (String s : splitter.split(line)) {
                WordAnalysis results = parser.analyze(s);
                total++;
                if (total % 50000 == 0) {
                    System.out.println("Processed: " + total);
                }
                if (results.analysisCount() == 0) {
                    uniques.add(s);
                }
            }
        }
        System.out.println("Total: " + total);
    }
    Stats st = new Stats(0.0002);
    st.allCounts = (int) uniques.totalCount();
    st.allUniques = uniques.size();
    for (String s : uniques.getSortedList()) {
        int count = uniques.getCount(s);
        // Only words seen more than 5 times are considered significant.
        if (count > 5) {
            st.significantCounts += count;
            st.significantUniques++;
            System.out.println(s + " : " + count);
        }
    }
    st.dump();
}
Also used : Histogram(zemberek.core.collections.Histogram) Splitter(com.google.common.base.Splitter) WordAnalysis(zemberek.morphology.analysis.WordAnalysis)

Aggregations

Histogram (zemberek.core.collections.Histogram)39 WordAnalysis (zemberek.morphology.analysis.WordAnalysis)17 Path (java.nio.file.Path)15 SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis)14 ArrayList (java.util.ArrayList)13 TurkishMorphology (zemberek.morphology.TurkishMorphology)12 Token (zemberek.tokenization.Token)12 Stopwatch (com.google.common.base.Stopwatch)11 PrintWriter (java.io.PrintWriter)11 LinkedHashSet (java.util.LinkedHashSet)11 SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis)11 IOException (java.io.IOException)10 Files (java.nio.file.Files)10 Paths (java.nio.file.Paths)10 List (java.util.List)10 Collectors (java.util.stream.Collectors)10 TurkishTokenizer (zemberek.tokenization.TurkishTokenizer)10 StandardCharsets (java.nio.charset.StandardCharsets)9 HashSet (java.util.HashSet)9 Log (zemberek.core.logging.Log)9