Search in sources :

Example 1 with NGram

use of smile.nlp.NGram in project smile by haifengl.

the class CooccurrenceKeywordExtractorTest method testExtract.

/**
 * Test of the extract method of class CooccurrenceKeywordExtractor:
 * the Turing essay should yield exactly 10 keywords.
 */
@Test
public void testExtract() throws FileNotFoundException {
    System.out.println("extract");
    Scanner in = new Scanner(smile.data.parser.IOUtils.getTestDataReader("text/turing.txt"));
    // Read the whole file in one token ("\Z" anchors at end of input).
    String document = in.useDelimiter("\\Z").next();
    in.close();
    CooccurrenceKeywordExtractor extractor = new CooccurrenceKeywordExtractor();
    ArrayList<NGram> keywords = extractor.extract(document);
    assertEquals(10, keywords.size());
    for (NGram keyword : keywords) {
        System.out.println(keyword);
    }
}
Also used : Scanner(java.util.Scanner) NGram(smile.nlp.NGram) Test(org.junit.Test)

Example 2 with NGram

use of smile.nlp.NGram in project smile by haifengl.

the class CooccurrenceKeywordExtractor method extract.

/**
 * Returns a given number of top keywords of a single document, scored by
 * chi-square statistics over the co-occurrence of frequent terms within
 * sentences (the approach appears to follow the keyword-extraction paper
 * referenced in the comments below -- TODO confirm the exact citation).
 *
 * @param text A single document.
 * @param maxNumKeywords the maximum number of keywords to return.
 * @return The top keywords, highest score first.
 */
public ArrayList<NGram> extract(String text, int maxNumKeywords) {
    ArrayList<String[]> sentences = new ArrayList<>();
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    PorterStemmer stemmer = new PorterStemmer();
    // Split text into sentences. Stem words by Porter algorithm.
    // ntotal counts all tokens in the document.
    int ntotal = 0;
    for (String paragraph : SimpleParagraphSplitter.getInstance().split(text)) {
        for (String s : SimpleSentenceSplitter.getInstance().split(paragraph)) {
            String[] sentence = tokenizer.split(s);
            for (int i = 0; i < sentence.length; i++) {
                sentence[i] = stemmer.stripPluralParticiple(sentence[i]).toLowerCase();
            }
            sentences.add(sentence);
            ntotal += sentence.length;
        }
    }
    // Extract candidate phrases (up to 4-grams, occurring at least 4 times)
    // by an Apriori-like algorithm.
    int maxNGramSize = 4;
    ArrayList<NGram> terms = new ArrayList<>();
    AprioriPhraseExtractor phraseExtractor = new AprioriPhraseExtractor();
    for (ArrayList<NGram> ngrams : phraseExtractor.extract(sentences, maxNGramSize, 4)) {
        for (NGram ngram : ngrams) {
            terms.add(ngram);
        }
    }
    // Natural order of NGram -- presumably ascending by frequency, since the
    // most frequent terms are taken from the tail below. TODO confirm.
    Collections.sort(terms);
    // Select up to 30% most frequent terms.
    int n = 3 * terms.size() / 10;
    NGram[] freqTerms = new NGram[n];
    for (int i = 0, start = terms.size() - n; i < n; i++) {
        freqTerms[i] = terms.get(start + i);
    }
    // Trie for phrase matching: maps a phrase (word array) to its index
    // in freqTerms.
    Trie<String, Integer> trie = new Trie<>();
    for (int i = 0; i < n; i++) {
        trie.put(freqTerms[i].words, i);
    }
    // Build co-occurrence table: table[i][j] counts sentence-level
    // co-occurrences of frequent terms i and j; nw[i] accumulates the number
    // of terms co-occurring with i (including i itself, once per sentence).
    int[] nw = new int[n];
    int[][] table = new int[n][n];
    for (String[] sentence : sentences) {
        Set<Integer> phrases = new HashSet<>();
        // Slide windows of every length 1..maxNGramSize over the sentence
        // and collect which frequent terms it contains.
        for (int j = 1; j <= maxNGramSize; j++) {
            for (int i = 0; i <= sentence.length - j; i++) {
                String[] phrase = Arrays.copyOfRange(sentence, i, i + j);
                Integer index = trie.get(phrase);
                if (index != null) {
                    phrases.add(index);
                }
            }
        }
        for (int i : phrases) {
            nw[i] += phrases.size();
            for (int j : phrases) {
                if (i != j) {
                    table[i][j]++;
                }
            }
        }
    }
    // Clustering frequent terms. cluster[i] holds the index of i's cluster
    // representative; initially every term is its own cluster.
    int[] cluster = new int[n];
    for (int i = 0; i < cluster.length; i++) {
        cluster[i] = i;
    }
    //double log2 = Math.log(2.0);
    for (int i = 0; i < n; i++) {
        for (int j = i + 1; j < n; j++) {
            // Mutual information
            if (table[i][j] > 0) {
                // This doesn't work as ntotal is usually large and thus the mutual information
                // is way larger than the threshold log2 given in the paper.
                //double mutual = Math.log((double) ntotal * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq));
                // Here we just use the (squared) geometric average of co-occurrence probability.
                // It works well to cluster things like "digital computer" and "computer" in practice.
                double mutual = (double) table[i][j] * table[i][j] / (freqTerms[i].freq * freqTerms[j].freq);
                if (mutual >= 0.25) {
                    cluster[j] = cluster[i];
                }
            /*else {
                        double js = 0.0; // Jensen-Shannon divergence
                        for (int k = 0; k < n; k++) {
                            double p1 = (double) table[i][k] / freqTerms[i].freq;
                            double p2 = (double) table[j][k] / freqTerms[j].freq;

                            // The formula in the paper is not correct as p is not real probability.
                            if (p1 > 0 && p2 > 0) {
                                js += -(p1+p2) * Math.log((p1+p2)/2.0) + p1 * Math.log(p1) + p2 * Math.log(p2);
                            }
                        }
                    
                        js /= 2.0;
                        if (js > log2) {
                            cluster[j] = cluster[i];
                        }
                    }*/
            }
        }
    }
    // Calculate expected probability per cluster. pc is indexed by the
    // cluster representative; entries at non-representative indices hold
    // leftover partial sums but are never read below (pc[j] is only used
    // where cluster[j] == j).
    double[] pc = new double[n];
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            pc[cluster[j]] += table[i][j];
        }
    }
    for (int i = 0; i < n; i++) {
        pc[i] /= ntotal;
    }
    // Calculate chi-square scores: deviation of term i's observed
    // co-occurrence with each cluster from its expectation.
    double[] score = new double[n];
    for (int i = 0; i < n; i++) {
        double max = Double.NEGATIVE_INFINITY;
        for (int j = 0; j < n; j++) {
            // Only cluster representatives (cluster[j] == j) define a column.
            if (cluster[j] != j) {
                continue;
            }
            // Observed co-occurrence frequency of term i with cluster j.
            double fwc = 0.0;
            for (int k = 0; k < n; k++) {
                if (cluster[k] == j)
                    fwc += table[i][k];
            }
            double expected = nw[i] * pc[j];
            double d = (fwc - expected);
            // NOTE(review): if expected is 0 this yields NaN and poisons
            // score[i] -- confirm pc[j] > 0 always holds for representatives.
            double chisq = d * d / expected;
            score[i] += chisq;
            if (chisq > max)
                max = chisq;
        }
    //score[i] -= max;
    }
    // Presumably sorts score ascending in place and returns the original
    // indices as the permutation -- TODO confirm QuickSort.sort contract.
    int[] index = QuickSort.sort(score);
    ArrayList<NGram> keywords = new ArrayList<>();
    // Walk from the highest score down, collecting up to maxNumKeywords.
    for (int i = n; i-- > 0; ) {
        boolean add = true;
        // filter out components of phrases, e.g. "digital" in "digital computer".
        for (int j = i + 1; j < n; j++) {
            if (cluster[index[j]] == cluster[index[i]]) {
                if (freqTerms[index[j]].words.length >= freqTerms[index[i]].words.length) {
                    add = false;
                    break;
                } else {
                    keywords.remove(freqTerms[index[j]]);
                    add = true;
                }
            }
        }
        if (add) {
            keywords.add(freqTerms[index[i]]);
            if (keywords.size() >= maxNumKeywords)
                break;
        }
    }
    return keywords;
}
Also used : ArrayList(java.util.ArrayList) NGram(smile.nlp.NGram) PorterStemmer(smile.nlp.stemmer.PorterStemmer) SimpleTokenizer(smile.nlp.tokenizer.SimpleTokenizer) AprioriPhraseExtractor(smile.nlp.collocation.AprioriPhraseExtractor) Trie(smile.nlp.Trie) HashSet(java.util.HashSet)

Example 3 with NGram

use of smile.nlp.NGram in project smile by haifengl.

the class AprioriPhraseExtractorTest method testExtract.

/**
 * Test of the extract method of class AprioriPhraseExtractor: the Turing
 * essay should produce 5 n-gram lists (one per length 0 through 4).
 */
@Test
public void testExtract() throws FileNotFoundException {
    System.out.println("extract");
    Scanner in = new Scanner(smile.data.parser.IOUtils.getTestDataReader("text/turing.txt"));
    // Read the whole file in one token ("\Z" anchors at end of input).
    String document = in.useDelimiter("\\Z").next();
    in.close();
    SimpleTokenizer tokenizer = new SimpleTokenizer();
    PorterStemmer stemmer = new PorterStemmer();
    ArrayList<String[]> sentences = new ArrayList<>();
    // Split into sentences and normalize each token: strip plurals and
    // participles, then lowercase.
    for (String paragraph : SimpleParagraphSplitter.getInstance().split(document)) {
        for (String raw : SimpleSentenceSplitter.getInstance().split(paragraph)) {
            String[] tokens = tokenizer.split(raw);
            for (int k = 0; k < tokens.length; k++) {
                tokens[k] = stemmer.stripPluralParticiple(tokens[k]).toLowerCase();
            }
            sentences.add(tokens);
        }
    }
    AprioriPhraseExtractor extractor = new AprioriPhraseExtractor();
    ArrayList<ArrayList<NGram>> grouped = extractor.extract(sentences, 4, 4);
    assertEquals(5, grouped.size());
    for (ArrayList<NGram> group : grouped) {
        for (NGram phrase : group) {
            System.out.print(phrase);
        }
        System.out.println();
    }
}
Also used : Scanner(java.util.Scanner) PorterStemmer(smile.nlp.stemmer.PorterStemmer) SimpleTokenizer(smile.nlp.tokenizer.SimpleTokenizer) ArrayList(java.util.ArrayList) NGram(smile.nlp.NGram) Test(org.junit.Test)

Example 4 with NGram

use of smile.nlp.NGram in project smile by haifengl.

the class AprioriPhraseExtractor method extract.

/** Extracts n-gram phrases by an Apriori-like algorithm: an n-gram becomes
     * a candidate only if both of its (n-1)-gram sub-grams were frequent in
     * the previous round.
     *
     * @param sentences A collection of sentences (already split).
     * @param maxNGramSize The maximum length of n-gram
     * @param minFrequency The minimum frequency of n-gram in the sentences.
     * @return An array list of lists of n-grams. The i-th entry holds the
     *         i-grams, sorted by descending natural order (the 0-th entry is
     *         always empty).
     */
public ArrayList<ArrayList<NGram>> extract(Collection<String[]> sentences, int maxNGramSize, int minFrequency) {
    ArrayList<Set<NGram>> features = new ArrayList<>(maxNGramSize + 1);
    // Dummy 0-gram entry so that features.get(n) holds the frequent n-grams.
    features.add(new HashSet<>());
    for (int n = 1; n <= maxNGramSize; n++) {
        Map<NGram, Integer> candidates = new HashMap<>();
        Set<NGram> feature = new HashSet<>();
        features.add(feature);
        Set<NGram> feature_1 = features.get(n - 1);
        for (String[] sentence : sentences) {
            for (int i = 0; i <= sentence.length - n; i++) {
                NGram ngram = new NGram(Arrays.copyOfRange(sentence, i, i + n));
                boolean add = false;
                if (n == 1) {
                    add = true;
                } else {
                    // Apriori pruning: count this n-gram only if both of its
                    // (n-1)-gram sub-grams survived the previous round.
                    NGram initialGram = new NGram(Arrays.copyOfRange(sentence, i, i + n - 1));
                    NGram finalGram = new NGram(Arrays.copyOfRange(sentence, i + 1, i + n));
                    if (feature_1.contains(initialGram) && feature_1.contains(finalGram)) {
                        add = true;
                    }
                }
                if (add) {
                    // Single-lookup increment instead of containsKey/get/put.
                    candidates.merge(ngram, 1, Integer::sum);
                }
            }
        }
        // Keep candidates that meet the frequency threshold.
        for (Map.Entry<NGram, Integer> entry : candidates.entrySet()) {
            if (entry.getValue() >= minFrequency) {
                NGram ngram = entry.getKey();
                // Drop unigrams that are pure punctuation.
                if (ngram.words.length == 1 && EnglishPunctuations.getInstance().contains(ngram.words[0])) {
                    continue;
                }
                ngram.freq = entry.getValue();
                feature.add(ngram);
            }
        }
    }
    // Filter out phrases that begin or end with a stop word, or that
    // consist entirely of stop words.
    ArrayList<ArrayList<NGram>> results = new ArrayList<>();
    for (Set<NGram> ngrams : features) {
        ArrayList<NGram> result = new ArrayList<>();
        results.add(result);
        for (NGram ngram : ngrams) {
            boolean stopWord = true;
            if (!EnglishStopWords.DEFAULT.contains(ngram.words[0]) && !EnglishStopWords.DEFAULT.contains(ngram.words[ngram.words.length - 1])) {
                for (String word : ngram.words) {
                    if (!EnglishStopWords.DEFAULT.contains(word)) {
                        stopWord = false;
                        break;
                    }
                }
            }
            if (!stopWord) {
                result.add(ngram);
            }
        }
        // Descending order in a single pass (replaces sort + reverse).
        result.sort(Collections.reverseOrder());
    }
    return results;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) NGram(smile.nlp.NGram) Map(java.util.Map) HashMap(java.util.HashMap) HashSet(java.util.HashSet)

Aggregations

NGram (smile.nlp.NGram)4 ArrayList (java.util.ArrayList)3 HashSet (java.util.HashSet)2 Scanner (java.util.Scanner)2 Test (org.junit.Test)2 PorterStemmer (smile.nlp.stemmer.PorterStemmer)2 SimpleTokenizer (smile.nlp.tokenizer.SimpleTokenizer)2 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Set (java.util.Set)1 Trie (smile.nlp.Trie)1 AprioriPhraseExtractor (smile.nlp.collocation.AprioriPhraseExtractor)1