Search in sources :

Example 1 with AprioriPhraseExtractor

Use of smile.nlp.collocation.AprioriPhraseExtractor in the project smile by haifengl.

The method extract of the class CooccurrenceKeywordExtractor.

/**
     * Returns a given number of top keywords.
     * @param text A single document.
     * @return The top keywords.
     */
public ArrayList<NGram> extract(String text, int maxNumKeywords) {
    ArrayList<String[]> sentences = new ArrayList<>();
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    PorterStemmer stemmer = new PorterStemmer();
    // Split text into sentences. Stem words by Porter algorithm.
    int ntotal = 0;
    for (String paragraph : SimpleParagraphSplitter.getInstance().split(text)) {
        for (String s : SimpleSentenceSplitter.getInstance().split(paragraph)) {
            String[] sentence = tokenizer.split(s);
            for (int i = 0; i < sentence.length; i++) {
                sentence[i] = stemmer.stripPluralParticiple(sentence[i]).toLowerCase();
            }
            sentences.add(sentence);
            ntotal += sentence.length;
        }
    }
    //  Extract phrases by Apriori-like algorithm.
    int maxNGramSize = 4;
    ArrayList<NGram> terms = new ArrayList<>();
    AprioriPhraseExtractor phraseExtractor = new AprioriPhraseExtractor();
    for (ArrayList<NGram> ngrams : phraseExtractor.extract(sentences, maxNGramSize, 4)) {
        for (NGram ngram : ngrams) {
            terms.add(ngram);
        }
    }
    Collections.sort(terms);
    // Select upto 30% most frequent terms.
    int n = 3 * terms.size() / 10;
    NGram[] freqTerms = new NGram[n];
    for (int i = 0, start = terms.size() - n; i < n; i++) {
        freqTerms[i] = terms.get(start + i);
    }
    // Trie for phrase matching.
    Trie<String, Integer> trie = new Trie<>();
    for (int i = 0; i < n; i++) {
        trie.put(freqTerms[i].words, i);
    }
    // Build co-occurrence table
    int[] nw = new int[n];
    int[][] table = new int[n][n];
    for (String[] sentence : sentences) {
        Set<Integer> phrases = new HashSet<>();
        for (int j = 1; j <= maxNGramSize; j++) {
            for (int i = 0; i <= sentence.length - j; i++) {
                String[] phrase = Arrays.copyOfRange(sentence, i, i + j);
                Integer index = trie.get(phrase);
                if (index != null) {
                    phrases.add(index);
                }
            }
        }
        for (int i : phrases) {
            nw[i] += phrases.size();
            for (int j : phrases) {
                if (i != j) {
                    table[i][j]++;
                }
            }
        }
    }
    // Clustering frequent terms.
    int[] cluster = new int[n];
    for (int i = 0; i < cluster.length; i++) {
        cluster[i] = i;
    }
    //double log2 = Math.log(2.0);
    for (int i = 0; i < n; i++) {
        for (int j = i + 1; j < n; j++) {
            // Mutual information
            if (table[i][j] > 0) {
                // This doesn't work as ntotal is usually large and thus the mutual information
                // is way larger than the threshold log2 given in the paper.
                //double mutual = Math.log((double) ntotal * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq));
                // Here we just use the (squared) geometric average of co-occurrence probability
                // It works well to clustering things like "digital computer" and "computer" in practice.
                double mutual = (double) table[i][j] * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq);
                if (mutual >= 0.25) {
                    cluster[j] = cluster[i];
                }
            /*else {
                        double js = 0.0; // Jsensen-Shannon divergence
                        for (int k = 0; k < n; k++) {
                            double p1 = (double) table[i][k] / freqTerms[i].freq;
                            double p2 = (double) table[j][k] / freqTerms[j].freq;

                            // The formula in the paper is not correct as p is not real probablity.
                            if (p1 > 0 && p2 > 0) {
                                js += -(p1+p2) * Math.log((p1+p2)/2.0) + p1 * Math.log(p1) + p2 * Math.log(p2);
                            }
                        }
                    
                        js /= 2.0;
                        if (js > log2) {
                            cluster[j] = cluster[i];
                        }
                    }*/
            }
        }
    }
    // Calculate expected probability
    double[] pc = new double[n];
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            pc[cluster[j]] += table[i][j];
        }
    }
    for (int i = 0; i < n; i++) {
        pc[i] /= ntotal;
    }
    // Calculate chi-square scores.
    double[] score = new double[n];
    for (int i = 0; i < n; i++) {
        double max = Double.NEGATIVE_INFINITY;
        for (int j = 0; j < n; j++) {
            if (cluster[j] != j) {
                continue;
            }
            double fwc = 0.0;
            for (int k = 0; k < n; k++) {
                if (cluster[k] == j)
                    fwc += table[i][k];
            }
            double expected = nw[i] * pc[j];
            double d = (fwc - expected);
            double chisq = d * d / expected;
            score[i] += chisq;
            if (chisq > max)
                max = chisq;
        }
    //score[i] -= max;
    }
    int[] index = QuickSort.sort(score);
    ArrayList<NGram> keywords = new ArrayList<>();
    for (int i = n; i-- > 0; ) {
        boolean add = true;
        // filter out components of phrases, e.g. "digital" in "digital computer".
        for (int j = i + 1; j < n; j++) {
            if (cluster[index[j]] == cluster[index[i]]) {
                if (freqTerms[index[j]].words.length >= freqTerms[index[i]].words.length) {
                    add = false;
                    break;
                } else {
                    keywords.remove(freqTerms[index[j]]);
                    add = true;
                }
            }
        }
        if (add) {
            keywords.add(freqTerms[index[i]]);
            if (keywords.size() >= maxNumKeywords)
                break;
        }
    }
    return keywords;
}
Also used : ArrayList(java.util.ArrayList) NGram(smile.nlp.NGram) PorterStemmer(smile.nlp.stemmer.PorterStemmer) SimpleTokenizer(smile.nlp.tokenizer.SimpleTokenizer) AprioriPhraseExtractor(smile.nlp.collocation.AprioriPhraseExtractor) Trie(smile.nlp.Trie) HashSet(java.util.HashSet)

Aggregations

ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 NGram (smile.nlp.NGram)1 Trie (smile.nlp.Trie)1 AprioriPhraseExtractor (smile.nlp.collocation.AprioriPhraseExtractor)1 PorterStemmer (smile.nlp.stemmer.PorterStemmer)1 SimpleTokenizer (smile.nlp.tokenizer.SimpleTokenizer)1