
Example 1 with Histogram

Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

From the class UnsupervisedKeyPhraseExtractor, the method collectCorpusStatisticsForLemmas:

static CorpusStatistics collectCorpusStatisticsForLemmas(WebCorpus corpus, TurkishSentenceAnalyzer analyzer, int count) throws IOException {
    CorpusStatistics statistics = new CorpusStatistics(1_000_000);
    int docCount = 0;
    for (WebDocument document : corpus.getDocuments()) {
        // Lemma frequencies within this single document.
        Histogram<String> docHistogram = new Histogram<>();
        List<String> sentences = extractor.fromParagraphs(document.getLines());
        for (String sentence : sentences) {
            List<WordAnalysis> analysis = analyzer.bestParse(sentence);
            for (WordAnalysis w : analysis) {
                if (!analysisAcceptable(w)) {
                    continue;
                }
                String s = w.getSurfaceForm();
                if (TurkishStopWords.DEFAULT.contains(s)) {
                    continue;
                }
                List<String> lemmas = w.getLemmas();
                docHistogram.add(lemmas.get(lemmas.size() - 1));
            }
        }
        // Add this document's histogram to the corpus-wide term frequencies; iterating
        // docHistogram yields each distinct lemma once, so document frequencies count
        // a lemma at most once per document.
        statistics.termFrequencies.add(docHistogram);
        for (String s : docHistogram) {
            statistics.documentFrequencies.add(s);
        }
        if (docCount++ % 500 == 0) {
            Log.info("Doc count = %d", docCount);
        }
        if (count > 0 && docCount > count) {
            break;
        }
    }
    statistics.documentCount = count > 0 ? Math.min(count, corpus.documentCount()) : corpus.documentCount();
    return statistics;
}
Also used: Histogram (zemberek.core.collections.Histogram), WebDocument (zemberek.corpus.WebDocument), WordAnalysis (zemberek.morphology.analysis.WordAnalysis)
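
The pattern to note here: a fresh Histogram is filled per document, then its distinct keys feed the document frequencies while its counts feed the term frequencies. Below is a minimal, self-contained sketch of that pattern; the toy documents, the stop-word set, and the whitespace split are placeholders for the WebCorpus, TurkishStopWords, and morphological analysis used in the real method, and only Histogram calls that appear in these examples (add, iteration, getCount) are used.

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import zemberek.core.collections.Histogram;

public class CorpusStatisticsSketch {

    public static void main(String[] args) {
        // Toy corpus: each string stands in for one document's text.
        List<String> documents = Arrays.asList(
                "kedi ev kedi bahçe",
                "ev bahçe ağaç",
                "kedi ağaç");
        Set<String> stopWords = Set.of("ve", "bir");

        Histogram<String> termFrequencies = new Histogram<>();     // total occurrences per term
        Histogram<String> documentFrequencies = new Histogram<>(); // number of documents containing the term

        for (String document : documents) {
            Histogram<String> docHistogram = new Histogram<>();
            for (String word : document.split(" ")) {
                if (stopWords.contains(word)) {
                    continue;
                }
                docHistogram.add(word);
                termFrequencies.add(word);
            }
            // Iterating a Histogram yields its distinct keys, so each term is
            // counted at most once per document here.
            for (String term : docHistogram) {
                documentFrequencies.add(term);
            }
        }

        System.out.println("tf(kedi) = " + termFrequencies.getCount("kedi"));      // 3
        System.out.println("df(kedi) = " + documentFrequencies.getCount("kedi"));  // 2
    }
}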

Example 2 with Histogram

Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

From the class GenerateVocabulary, the method run:

@Override
protected void run() throws Exception {
    if (!corpus.exists()) {
        throw new IllegalArgumentException("Can not find the corpus file: " + corpus);
    }
    if (top < -1 || top == 0) {
        throw new IllegalArgumentException("Illegal value for top: " + top);
    }
    Set<String> wordsToInclude = getWordsFromFile(includeFile);
    Log.info("Amount of words to include using include file: %d", wordsToInclude.size());
    Set<String> wordsToExclude = getWordsFromFile(excludeFile);
    Log.info("Amount of words to exclude using exclude file: %d", wordsToExclude.size());
    Set<String> intersection = Sets.newHashSet(wordsToExclude);
    intersection.retainAll(wordsToInclude);
    if (intersection.size() != 0) {
        Log.warn("There are matching words in both include and exclude files: " + intersection.toString());
    }
    Collator collator = Collator.getInstance(Locale.ENGLISH);
    if (sortLocale != null) {
        collator = Collator.getInstance(new Locale(sortLocale));
    }
    Log.info("Processing corpus: %s", corpus);
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(corpus), "utf-8"))) {
        String line;
        Histogram<String> histogram = new Histogram<>(50000);
        SpaceTabTokenizer tokenizer = new SpaceTabTokenizer();
        int count = 0;
        while ((line = reader.readLine()) != null) {
            List<String> words = Lists.newArrayList(tokenizer.split(line));
            if (words.isEmpty()) {
                continue;
            }
            histogram.add(words);
            if (count % 500000 == 0 && count != 0) {
                Log.info("%d lines processed. Vocabulary Size: %d", count, histogram.size());
            }
            count++;
        }
        Log.info("A total of %d lines have been processed. Vocabulary Size: %d", count, histogram.size());
        if (top >= histogram.size()) {
            top = histogram.size();
        } else {
            Log.info("Top %d words will be used.", top);
        }
        List<String> mostFrequent = histogram.getTop(top);
        Log.info("Coverage: %.3f", 100d * ((double) histogram.totalCount(mostFrequent)) / histogram.totalCount());
        LinkedHashSet<String> resultSet = Sets.newLinkedHashSet(mostFrequent);
        resultSet.addAll(wordsToInclude);
        resultSet.removeAll(wordsToExclude);
        List<String> result = Lists.newArrayList(resultSet);
        Log.info("Total size of vocabulary: %d", result.size());
        if (ordered) {
            Log.info("Sorting file with word order.");
            Collections.sort(result, collator);
        }
        com.google.common.io.Files.createParentDirs(outFile);
        Log.info("Saving to vocabulary file: %s", outFile);
        SimpleTextWriter.utf8Builder(outFile).addNewLineBeforClose().build().writeLines(result);
        Log.info("Done.");
    }
}
Also used: Locale (java.util.Locale), Histogram (zemberek.core.collections.Histogram), InputStreamReader (java.io.InputStreamReader), FileInputStream (java.io.FileInputStream), Collator (java.text.Collator), BufferedReader (java.io.BufferedReader), SpaceTabTokenizer (zemberek.core.SpaceTabTokenizer)
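
The interesting Histogram calls here are getTop, totalCount(list) and totalCount(), which together produce the coverage figure logged above. A stripped-down sketch of just that computation, with a toy two-line corpus in place of the real corpus file, might look like this:

import java.util.Arrays;
import java.util.List;
import zemberek.core.collections.Histogram;

public class VocabularyCoverageSketch {

    public static void main(String[] args) {
        List<String> lines = Arrays.asList(
                "a b a c a",
                "b a d b");

        Histogram<String> histogram = new Histogram<>(100);
        for (String line : lines) {
            // As in the example above, add(list) increments the count of every element.
            histogram.add(Arrays.asList(line.split(" ")));
        }

        int top = Math.min(2, histogram.size());
        List<String> mostFrequent = histogram.getTop(top);

        // Coverage: the share of all tokens that the top-N vocabulary accounts for.
        double coverage = 100d * histogram.totalCount(mostFrequent) / histogram.totalCount();
        System.out.printf("Top %d words cover %.1f%% of %d tokens%n", top, coverage, histogram.totalCount());
    }
}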

Example 3 with Histogram

Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

From the class TurkishSentenceAnalyzerTest, the method doParseSentencesInCorpus:

private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException {
    List<String> sentences = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList();
    Stopwatch sw = Stopwatch.createStarted();
    long wc = 0;
    int s = 0;
    Histogram<String> unknownStuff = new Histogram<>();
    for (String sentence : sentences) {
        SentenceAnalysis parse = parser.analyze(sentence);
        for (SentenceAnalysis.Entry entry : parse) {
            List<WordAnalysis> parses = entry.parses;
            for (WordAnalysis wordAnalysis : parses) {
                if (wordAnalysis.dictionaryItem == DictionaryItem.UNKNOWN) {
                    unknownStuff.add(wordAnalysis.getSurfaceForm());
                }
            }
        }
        wc += parse.size();
        // parser.disambiguate(parse);
        s++;
        if (s % 10000 == 0) {
            System.out.println(s);
            System.out.println(sw.elapsed(TimeUnit.MILLISECONDS) / 1000d);
        }
    }
    try (PrintWriter pw = new PrintWriter("unknown.txt", "utf-8")) {
        for (String s1 : unknownStuff.getSortedList()) {
            pw.println(s1 + " " + unknownStuff.getCount(s1));
        }
    }
    System.out.println("Word count = " + wc);
    System.out.println("Elapsed Time =" + sw.elapsed(TimeUnit.MILLISECONDS));
    System.out.println("Parse and disambiguate per second = " + (wc * 1000d) / (sw.elapsed(TimeUnit.MILLISECONDS)));
}
Also used: Histogram (zemberek.core.collections.Histogram), WordAnalysis (zemberek.morphology.analysis.WordAnalysis), Stopwatch (com.google.common.base.Stopwatch), SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis), PrintWriter (java.io.PrintWriter)
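
Here the Histogram is simply a tally of unknown surface forms that is dumped in sorted order at the end, via getSortedList and getCount. A minimal sketch of that dump step, writing to a hypothetical output file, is:

import java.io.PrintWriter;
import zemberek.core.collections.Histogram;

public class UnknownWordDumpSketch {

    public static void main(String[] args) throws Exception {
        Histogram<String> unknown = new Histogram<>();
        unknown.add("foo");
        unknown.add("foo");
        unknown.add("bar");

        // getSortedList() returns the keys in sorted order (by count in the test above);
        // getCount() returns the tally for a single key.
        try (PrintWriter pw = new PrintWriter("unknown-sketch.txt", "utf-8")) {
            for (String word : unknown.getSortedList()) {
                pw.println(word + " " + unknown.getCount(word));
            }
        }
    }
}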

Example 4 with Histogram

Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

From the class SpeedTest, the method testNewsCorpus:

@Test
@Ignore(value = "Speed Test.")
public void testNewsCorpus() throws IOException {
    // Path p = Paths.get("/media/aaa/Data/corpora/me-sentences/www.aljazeera.com.tr/2018-02-22");
    Path p = Paths.get("src/test/resources/corpora/cnn-turk-10k");
    List<String> sentences = getSentences(p);
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    Stopwatch sw = Stopwatch.createStarted();
    int tokenCount = 0;
    int noAnalysis = 0;
    int sentenceCount = 0;
    Histogram<String> failedWords = new Histogram<>(100000);
    for (String sentence : sentences) {
        List<Token> tokens = TurkishTokenizer.DEFAULT.tokenize(sentence);
        for (Token token : tokens) {
            if (token.getType() == Token.Type.Punctuation) {
                continue;
            }
            tokenCount++;
            WordAnalysis results = morphology.analyze(token.getText());
            if (!results.isCorrect()) {
                noAnalysis++;
                failedWords.add(token.getText());
            }
        }
        sentenceCount++;
        if (sentenceCount % 2000 == 0) {
            Log.info("%d tokens analyzed.", tokenCount);
        }
    }
    double seconds = sw.stop().elapsed(TimeUnit.MILLISECONDS) / 1000d;
    double speed = tokenCount / seconds;
    double parseRatio = 100 - (noAnalysis * 100d / tokenCount);
    Log.info("%nElapsed = %.2f seconds", seconds);
    Log.info("%nToken Count (No Punc) = %d %nParse Ratio = %.4f%nSpeed = %.2f tokens/sec%n", tokenCount, parseRatio, speed);
    Log.info("Saving Unknown Tokens");
    failedWords.saveSortedByCounts(Paths.get("unk.freq"), " ");
    failedWords.saveSortedByKeys(Paths.get("unk"), " ", Turkish.STRING_COMPARATOR_ASC);
}
Also used: Path (java.nio.file.Path), Histogram (zemberek.core.collections.Histogram), Stopwatch (com.google.common.base.Stopwatch), Token (zemberek.tokenization.Token), TurkishMorphology (zemberek.morphology.TurkishMorphology), Ignore (org.junit.Ignore), Test (org.junit.Test)
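
Besides in-memory counting, this test persists the failure histogram with saveSortedByCounts and saveSortedByKeys. A minimal sketch of those two calls follows; the file names are hypothetical, and the zemberek.core.turkish.Turkish import path for STRING_COMPARATOR_ASC is an assumption, since the import list above does not show it.

import java.nio.file.Paths;
import zemberek.core.collections.Histogram;
import zemberek.core.turkish.Turkish;

public class HistogramSaveSketch {

    public static void main(String[] args) throws Exception {
        Histogram<String> failedWords = new Histogram<>(100);
        failedWords.add("foo");
        failedWords.add("foo");
        failedWords.add("bar");

        // One "key<delimiter>count" line per key, ordered by count.
        failedWords.saveSortedByCounts(Paths.get("failed.freq"), " ");
        // Same content, ordered by the given key comparator instead.
        failedWords.saveSortedByKeys(Paths.get("failed.sorted"), " ", Turkish.STRING_COMPARATOR_ASC);
    }
}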

Example 5 with Histogram

Use of zemberek.core.collections.Histogram in project zemberek-nlp by ahmetaa.

From the class UnsupervisedKeyPhraseExtractor, the method collectCorpusStatistics:

static CorpusStatistics collectCorpusStatistics(WebCorpus corpus) throws IOException {
    CorpusStatistics statistics = new CorpusStatistics(1_000_000);
    for (WebDocument document : corpus.getDocuments()) {
        Histogram<String> docHistogram = new Histogram<>();
        List<String> sentences = extractor.fromParagraphs(document.getLines());
        for (String sentence : sentences) {
            List<Token> tokens = lexer.tokenize(sentence);
            for (Token token : tokens) {
                if (!tokenTypeAccpetable(token)) {
                    continue;
                }
                String s = normalize(token.getText());
                if (TurkishStopWords.DEFAULT.contains(s)) {
                    continue;
                }
                docHistogram.add(s);
            }
        }
        statistics.termFrequencies.add(docHistogram);
        for (String s : docHistogram) {
            statistics.documentFrequencies.add(s);
        }
    }
    statistics.documentCount = corpus.documentCount();
    return statistics;
}
Also used: Histogram (zemberek.core.collections.Histogram), WebDocument (zemberek.corpus.WebDocument), Token (zemberek.tokenization.Token)
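
This follows the same per-document Histogram pattern as Example 1; the only difference is that normalized surface forms from the tokenizer are counted instead of lemmas from the morphological analysis, so the sketch shown after Example 1 applies here unchanged.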

Aggregations

Histogram (zemberek.core.collections.Histogram): 39
WordAnalysis (zemberek.morphology.analysis.WordAnalysis): 17
Path (java.nio.file.Path): 15
SingleAnalysis (zemberek.morphology.analysis.SingleAnalysis): 14
ArrayList (java.util.ArrayList): 13
TurkishMorphology (zemberek.morphology.TurkishMorphology): 12
Token (zemberek.tokenization.Token): 12
Stopwatch (com.google.common.base.Stopwatch): 11
PrintWriter (java.io.PrintWriter): 11
LinkedHashSet (java.util.LinkedHashSet): 11
SentenceAnalysis (zemberek.morphology.analysis.SentenceAnalysis): 11
IOException (java.io.IOException): 10
Files (java.nio.file.Files): 10
Paths (java.nio.file.Paths): 10
List (java.util.List): 10
Collectors (java.util.stream.Collectors): 10
TurkishTokenizer (zemberek.tokenization.TurkishTokenizer): 10
StandardCharsets (java.nio.charset.StandardCharsets): 9
HashSet (java.util.HashSet): 9
Log (zemberek.core.logging.Log): 9