Search in sources :

Example 1 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class KeywordNGramUtils method getMultipleViewKeywordNgrams.

/**
 * Collects keyword ngrams from every given view and sums the counts into a single
 * frequency distribution.
 * <p>
 * For each view the single {@code TextClassificationTarget} annotation is looked up via
 * {@code JCasUtil.selectSingle} and passed to {@code getDocumentKeywordNgrams} together with the
 * remaining arguments, which are forwarded unchanged.
 *
 * @param jcases
 *            the views to process
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param markSentenceLocation
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param includeCommas
 *            forwarded to {@code getDocumentKeywordNgrams}
 * @param keywords
 *            the keyword set, forwarded to {@code getDocumentKeywordNgrams}
 * @return the summed ngram counts over all views; empty if {@code jcases} is empty
 */
public static FrequencyDistribution<String> getMultipleViewKeywordNgrams(List<JCas> jcases, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> combined = new FrequencyDistribution<String>();
    for (JCas currentView : jcases) {
        TextClassificationTarget target = JCasUtil.selectSingle(currentView, TextClassificationTarget.class);
        FrequencyDistribution<String> perView = getDocumentKeywordNgrams(currentView, target, minN, maxN,
                markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
        // The distributions are merged manually, one key at a time, carrying each key's count over.
        for (String ngram : perView.getKeys()) {
            long count = perView.getCount(ngram);
            combined.addSample(ngram, count);
        }
    }
    return combined;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 2 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class KeywordNGramUtils method getDocumentKeywordNgrams.

/**
 * Builds a frequency distribution of ngrams over the keywords found in the sentences covered by
 * {@code anno}.
 * <p>
 * Each covered sentence is scanned token by token. At every token position the method assembles
 * lower-cased, space-separated strings that end at the current token and start at each earlier
 * position of the same sentence (the token alone, then previous + current, and so on back to the
 * sentence start). Every such string that is contained in {@code keywords} is appended to an
 * internal list, with spaces replaced by {@code MIDNGRAMGLUE}. Tokens are lower-cased before the
 * lookup, so {@code keywords} is expected to hold lower-case entries.
 * <p>
 * Optionally a {@code COMMA} entry is appended for "," tokens and a sentence-boundary entry after
 * each sentence. The resulting list is then turned into ngrams of length {@code minN} to
 * {@code maxN} via {@code NGramStringListIterable}; each ngram is joined with {@code GLUE} and
 * counted.
 * <p>
 * NOTE(review): every matching candidate is recorded, so when both a single-token keyword and a
 * multi-token keyword ending later match, both appear to be added - confirm whether that is the
 * intended treatment of overlapping keywords.
 *
 * @param jcas
 *            the jcas to read sentences and tokens from
 * @param anno
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            if true, a boundary entry starting with {@code SENTENCE_BOUNDARY} is appended after
 *            each sentence
 * @param markSentenceLocation
 *            if true, the boundary entry gets the suffix "BEG", "MID" or "END" depending on the
 *            sentence's relative position (below 0.25, in between, above 0.75); only has an
 *            effect together with {@code markSentenceBoundary}
 * @param includeCommas
 *            if true, "," tokens contribute a {@code COMMA} entry
 * @param keywords
 *            the keywords to look for
 * @return the counted keyword ngrams
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    List<String> matches = new ArrayList<String>();
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, anno);
    int totalSentences = sentences.size();

    for (int sentenceIndex = 0; sentenceIndex < totalSentences; sentenceIndex++) {
        List<Token> tokens = selectCovered(Token.class, sentences.get(sentenceIndex));

        for (int position = 0; position < tokens.size(); position++) {
            String current = tokens.get(position).getCoveredText().toLowerCase();
            boolean matched = false;
            String candidate = "";
            // Grow the candidate leftwards, one token per step, back to the sentence start.
            for (int start = position; start >= 0; start--) {
                candidate = tokens.get(start).getCoveredText().toLowerCase() + " " + candidate;
                // Drops the trailing separator left by concatenating onto an empty candidate;
                // note that this strips every space in the string, not only the last one.
                if (candidate.endsWith(" ")) {
                    candidate = candidate.replace(" ", "");
                }
                if (keywords.contains(candidate)) {
                    matches.add(candidate.replace(" ", MIDNGRAMGLUE));
                    matched = true;
                }
            }
            if (!matched && keywords.contains(current)) {
                matches.add(current);
            } else if (includeCommas && current.equals(",")) {
                matches.add(COMMA);
            }
        }

        if (markSentenceBoundary) {
            String marker = SENTENCE_BOUNDARY;
            if (markSentenceLocation) {
                double relativePosition = (double) sentenceIndex / totalSentences;
                if (relativePosition < 0.25) {
                    marker += "BEG";
                } else if (relativePosition > 0.75) {
                    marker += "END";
                } else {
                    marker += "MID";
                }
            }
            matches.add(marker);
        }
    }

    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    String[] matchArray = matches.toArray(new String[matches.size()]);
    for (List<String> ngram : new NGramStringListIterable(matchArray, minN, maxN)) {
        documentNgrams.inc(StringUtils.join(ngram, GLUE));
    }
    return documentNgrams;
}
Also used : ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 3 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class LuceneNGramPFE method getTopNgrams.

/**
 * Reads all terms of one field from the Lucene index at {@code luceneDir} and returns a bounded
 * selection of them together with their total term frequencies.
 * <p>
 * Every term is offered to a size-bounded {@code MinMaxPriorityQueue}; once the queue is full it
 * evicts entries according to the ordering of {@code TermFreqTuple}. NOTE(review): presumably
 * that ordering keeps the most frequent terms - verify against {@code TermFreqTuple}.
 * <p>
 * The index reader is opened in a try-with-resources statement so that it is closed on both the
 * normal and the exceptional path.
 *
 * @param topNgramThreshold
 *            maximum number of terms to keep
 * @param fieldName
 *            name of the index field whose terms are read
 * @return the retained terms with their total term frequency as count; empty if the index has no
 *         fields or the field has no terms
 * @throws ResourceInitializationException
 *             if opening, reading or closing the index fails
 */
private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir))) {
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        // Kept broad on purpose: callers only expect ResourceInitializationException.
        throw new ResourceInitializationException(e);
    }
    // Drain the queue in its polling order into the result distribution.
    while (!topN.isEmpty()) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
Also used : Terms(org.apache.lucene.index.Terms) TermFreqTuple(org.dkpro.tc.features.ngram.util.TermFreqTuple) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 4 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class PhoneticNGramMC method getDocumentPhoneticNgrams.

/**
 * Counts ngrams over the phonetic encodings of the tokens in each sentence covered by
 * {@code target}.
 * <p>
 * The encoder is chosen from the document language of {@code jcas}: {@code Soundex} for "en" and
 * {@code ColognePhonetic} for "de". Ngrams are built per sentence, so no ngram spans a sentence
 * border, and each ngram is joined with {@code NGRAM_GLUE} before it is counted.
 *
 * @param jcas
 *            the jcas to read the language, sentences and tokens from
 * @param target
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return the counted phonetic ngrams
 * @throws TextClassificationException
 *             if the document language is neither "en" nor "de", or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    String language = jcas.getDocumentLanguage();
    StringEncoder phoneticEncoder;
    switch (language) {
    case "en":
        phoneticEncoder = new Soundex();
        break;
    case "de":
        phoneticEncoder = new ColognePhonetic();
        break;
    default:
        throw new TextClassificationException("Language code '" + language + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> result = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, target)) {
        List<String> encoded = new ArrayList<String>();
        try {
            for (Token token : selectCovered(jcas, Token.class, sentence)) {
                encoded.add(phoneticEncoder.encode(token.getCoveredText()));
            }
        } catch (EncoderException e) {
            throw new TextClassificationException(e);
        }
        String[] encodedTokens = encoded.toArray(new String[encoded.size()]);
        for (List<String> ngram : new NGramStringListIterable(encodedTokens, minN, maxN)) {
            String joined = StringUtils.join(ngram, NGRAM_GLUE);
            result.inc(joined);
        }
    }
    return result;
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) ColognePhonetic(org.apache.commons.codec.language.ColognePhonetic) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) StringEncoder(org.apache.commons.codec.StringEncoder) Soundex(org.apache.commons.codec.language.Soundex) EncoderException(org.apache.commons.codec.EncoderException) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 5 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class MaxTokenLenMC method getNgramsFD.

/**
 * Records the character length of every token in {@code jcas}.
 * <p>
 * Each token yields one sample whose key is {@code "<length>_<random int>"} and whose count is
 * the token length. NOTE(review): the random suffix presumably keeps tokens of equal length from
 * collapsing into one key - confirm against the consumer of this distribution.
 *
 * @param jcas
 *            the jcas whose tokens are measured
 * @return a distribution with one sample per token, as described above
 * @throws TextClassificationException
 *             declared by the overridden method; not thrown by this implementation directly
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) throws TextClassificationException {
    FrequencyDistribution<String> lengths = new FrequencyDistribution<>();
    for (Token token : JCasUtil.select(jcas, Token.class)) {
        int length = token.getCoveredText().length();
        String key = length + "_" + r.nextInt();
        lengths.addSample(key, length);
    }
    return lengths;
}
Also used : Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)18 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)6 NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable)6 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)5 BufferedReader (java.io.BufferedReader)4 FileInputStream (java.io.FileInputStream)4 InputStreamReader (java.io.InputStreamReader)4 ArrayList (java.util.ArrayList)4 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)4 JCas (org.apache.uima.jcas.JCas)3 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)3 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)2 Fields (org.apache.lucene.index.Fields)2 IndexReader (org.apache.lucene.index.IndexReader)2 MultiFields (org.apache.lucene.index.MultiFields)2 Terms (org.apache.lucene.index.Terms)2 TermsEnum (org.apache.lucene.index.TermsEnum)2 BytesRef (org.apache.lucene.util.BytesRef)2 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)2 SkipNgramStringListIterable (org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable)2