Search in sources :

Example 1 with TermFreqTuple

use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.

the class LuceneNGramPFE method getTopNgrams.

/**
 * Collects terms of one Lucene index field into a frequency distribution.
 *
 * Every term of {@code fieldName} in the index at {@code luceneDir} is offered,
 * together with its total term frequency, to a bounded priority queue holding at
 * most {@code topNgramThreshold} entries. The surviving entries are then copied
 * into the returned distribution.
 *
 * NOTE(review): which entries survive once the queue is full depends on the
 * ordering defined by {@code TermFreqTuple}, which is not visible here -
 * presumably the most frequent terms are kept; confirm against that class.
 *
 * @param topNgramThreshold maximum number of terms to retain
 * @param fieldName name of the index field whose terms are read
 * @return the retained terms with their frequencies; empty if the index has no
 *         fields or the field has no terms
 * @throws ResourceInitializationException wrapping any exception raised while
 *         opening, reading or closing the index
 */
private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields != null) {
                Terms terms = fields.terms(fieldName);
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text = null;
                    while ((text = termsEnum.next()) != null) {
                        String term = text.utf8ToString();
                        long freq = termsEnum.totalTermFreq();
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        } finally {
            // Release the index handle on every path; the reader was previously
            // never closed, leaking it on each call.
            reader.close();
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the queue; size is captured first because poll() shrinks the queue.
    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
Also used : Terms(org.apache.lucene.index.Terms) TermFreqTuple(org.dkpro.tc.features.ngram.util.TermFreqTuple) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 2 with TermFreqTuple

use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.

the class LuceneFeatureExtractorBase method getTopNgrams.

/**
 * Returns the n-grams selected from the index, computing them on first use.
 *
 * A previously computed distribution is returned as-is unless
 * {@code forceRereadFromIndex} is set. Otherwise {@code maxNgramSum} is reset,
 * the index is re-read via {@code readIndex()}, and each queued tuple whose
 * frequency relative to {@code maxNgramSum} is at least
 * {@code ngramFreqThreshold} is added to a fresh distribution.
 *
 * @return the cached or newly built distribution of selected n-grams
 * @throws ResourceInitializationException if reading the index fails
 */
@Override
protected FrequencyDistribution<String> getTopNgrams() throws ResourceInitializationException {
    boolean mustRebuild = topNGrams == null || forceRereadFromIndex;
    if (!mustRebuild) {
        return topNGrams;
    }

    // Reset before reading: readIndex() accumulates into maxNgramSum.
    maxNgramSum = 0;
    topN = readIndex();
    topNGrams = new FrequencyDistribution<String>();

    // Drain exactly as many entries as the queue held when the loop started.
    for (int remaining = topN.size(); remaining > 0; remaining--) {
        TermFreqTuple candidate = topN.poll();
        double share = ((double) candidate.getFreq()) / maxNgramSum;
        if (share >= ngramFreqThreshold) {
            topNGrams.addSample(candidate.getTerm(), candidate.getFreq());
        }
    }

    logSelectionProcess(topNGrams.getB());
    return topNGrams;
}
Also used : TermFreqTuple(org.dkpro.tc.features.ngram.util.TermFreqTuple)

Example 3 with TermFreqTuple

use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.

the class LuceneFeatureExtractorBase method readIndex.

/**
 * Reads the terms of the configured field from the Lucene index into a bounded
 * priority queue.
 *
 * Each term of the field named by {@code getFieldName()} that passes
 * {@code passesScreening} is queued with its total term frequency, and that
 * frequency is added to the {@code maxNgramSum} field. This method does not
 * reset {@code maxNgramSum}; it only adds to it. The queue holds at most
 * {@code getTopN()} entries.
 *
 * @return the queue of screened terms; empty if the index has no fields or the
 *         field has no terms
 * @throws ResourceInitializationException wrapping any exception raised while
 *         opening or reading the index
 */
private MinMaxPriorityQueue<TermFreqTuple> readIndex() throws ResourceInitializationException {
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(getTopN()).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return topN;
            }
            Terms terms = fields.terms(getFieldName());
            if (terms == null) {
                return topN;
            }
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text = null;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                long freq = termsEnum.totalTermFreq();
                if (passesScreening(term)) {
                    topN.add(new TermFreqTuple(term, freq));
                    maxNgramSum += freq;
                }
            }
        } finally {
            // Single close point for every exit, including exceptions thrown
            // while iterating terms (previously the reader leaked in that case).
            // Closing quietly keeps a close failure from masking the original
            // exception.
            IOUtils.closeQuietly(reader);
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    return topN;
}
Also used : Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) IndexReader(org.apache.lucene.index.IndexReader) Terms(org.apache.lucene.index.Terms) TermFreqTuple(org.dkpro.tc.features.ngram.util.TermFreqTuple) BytesRef(org.apache.lucene.util.BytesRef) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 4 with TermFreqTuple

use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.

the class LuceneNGramCPFE method getTopNgramsCombo.

/**
 * Collects combined n-gram terms of one Lucene index field into a frequency
 * distribution.
 *
 * Each term is split on {@code ComboUtils.JOINT} into two halves. A term is kept
 * only if the first half is contained in both {@code topKSetView1} and
 * {@code topKSet}, the second half is contained in both {@code topKSetView2} and
 * {@code topKSet}, and the combined number of "_"-separated parts of both halves
 * lies within [{@code ngramMinNCombo}, {@code ngramMaxNCombo}]. Kept terms go
 * into a priority queue bounded by {@code topNgramThreshold}, whose contents are
 * then copied into the returned distribution.
 *
 * @param topNgramThreshold maximum number of terms to retain
 * @param fieldName name of the index field whose terms are read
 * @return the retained terms with their frequencies; empty if the index has no
 *         fields or the field has no terms
 * @throws ResourceInitializationException wrapping any exception raised while
 *         opening, reading or closing the index, including a term that does not
 *         split into at least two halves
 */
private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields != null) {
                Terms terms = fields.terms(fieldName);
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text = null;
                    while ((text = termsEnum.next()) != null) {
                        String term = text.utf8ToString();
                        long freq = termsEnum.totalTermFreq();
                        // Split once and reuse both halves instead of running
                        // the same regex split twice per term.
                        String[] halves = term.split(ComboUtils.JOINT);
                        String combo1 = halves[0];
                        String combo2 = halves[1];
                        int combinedSize = combo1.split("_").length + combo2.split("_").length;
                        if (topKSetView1.contains(combo1) && topKSet.contains(combo1) && topKSetView2.contains(combo2) && topKSet.contains(combo2) && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                            topN.add(new TermFreqTuple(term, freq));
                        }
                    }
                }
            }
        } finally {
            // Release the index handle on every path; the reader was previously
            // never closed, leaking it on each call.
            reader.close();
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the queue; size is captured first because poll() shrinks the queue.
    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
Also used : Terms(org.apache.lucene.index.Terms) TermFreqTuple(org.dkpro.tc.features.ngram.util.TermFreqTuple) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

TermFreqTuple (org.dkpro.tc.features.ngram.util.TermFreqTuple)4 Fields (org.apache.lucene.index.Fields)3 IndexReader (org.apache.lucene.index.IndexReader)3 MultiFields (org.apache.lucene.index.MultiFields)3 Terms (org.apache.lucene.index.Terms)3 TermsEnum (org.apache.lucene.index.TermsEnum)3 BytesRef (org.apache.lucene.util.BytesRef)3 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)3 FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)2 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)2