Search in sources :

Example 1 with NGramStringListIterable

use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.

the class KeywordNGramUtils method getDocumentKeywordNgrams.

/**
 * Builds a frequency distribution of minN- to maxN-length ngrams over the sequence of keyword
 * occurrences found in the sentences covered by {@code anno}.
 * <p>
 * Token text is lower-cased before lookup, so the entries of {@code keywords} are expected to be
 * lower-cased already. The keyword list can contain multi-token words like "brussel sprouts"
 * (tokens separated by a single space). For every token, the span ending at that token is grown
 * backwards, one token at a time, up to the start of the sentence; every span whose
 * space-joined text is contained in {@code keywords} is appended to the keyword sequence, with
 * the inner spaces replaced by {@code MIDNGRAMGLUE}. Consequently, if the keyword list contains
 * both "brussel" and "brussel sprouts", both entries are emitted for the text "Brussel
 * sprouts".
 * <p>
 * Sentence boundary markers are only emitted when {@code markSentenceBoundary} is set;
 * {@code markSentenceLocation} merely decorates those markers and has no effect on its own.
 *
 * @param jcas
 *            a jcas
 * @param anno
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            append {@code SENTENCE_BOUNDARY} to the keyword sequence after each sentence
 * @param markSentenceLocation
 *            suffix the boundary marker with "BEG", "MID" or "END" depending on the relative
 *            position of the sentence (first quarter, middle, last quarter)
 * @param includeCommas
 *            append {@code COMMA} to the keyword sequence for every "," token
 * @param keywords
 *            set of (lower-cased) keywords
 * @return all ngrams of keywords in jcas
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    List<String> keywordList = new ArrayList<String>();
    // Select the covered sentences once; the result serves both the count and the iteration.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, anno);
    int totalSentences = sentences.size();
    int sentenceNumber = 0;
    for (Sentence s : sentences) {
        List<Token> sentence = selectCovered(Token.class, s);
        for (int tokenpointer = 0; tokenpointer < sentence.size(); tokenpointer++) {
            String token = sentence.get(tokenpointer).getCoveredText();
            token = token.toLowerCase();
            String compositeNgram = "";
            boolean foundComposite = false;
            // Grow the candidate backwards from the current token to the sentence start.
            for (int i = tokenpointer; i >= 0; i--) {
                compositeNgram = sentence.get(i).getCoveredText().toLowerCase() + " " + compositeNgram;
                // On the first iteration the candidate is "<token> "; drop the dangling
                // separator so that single tokens can match.
                if (compositeNgram.endsWith(" ")) {
                    compositeNgram = compositeNgram.replace(" ", "");
                }
                if (keywords.contains(compositeNgram)) {
                    keywordList.add(compositeNgram.replace(" ", MIDNGRAMGLUE));
                    foundComposite = true;
                }
            }
            // Fallback for a plain token match that the span search above did not report.
            if (!foundComposite && keywords.contains(token)) {
                keywordList.add(token);
            } else if (includeCommas && token.equals(",")) {
                keywordList.add(COMMA);
            }
        }
        String sentenceBoundary = SENTENCE_BOUNDARY;
        if (markSentenceLocation) {
            // totalSentences is at least 1 here because we are inside the sentence loop.
            double relativePosition = (double) sentenceNumber / totalSentences;
            if (relativePosition < 0.25) {
                sentenceBoundary = sentenceBoundary + "BEG";
            } else if (relativePosition > 0.75) {
                sentenceBoundary = sentenceBoundary + "END";
            } else {
                sentenceBoundary = sentenceBoundary + "MID";
            }
        }
        if (markSentenceBoundary) {
            keywordList.add(sentenceBoundary);
        }
        sentenceNumber++;
    }
    for (List<String> ngram : new NGramStringListIterable(keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
        String ngramString = StringUtils.join(ngram, GLUE);
        documentNgrams.inc(ngramString);
    }
    return documentNgrams;
}
Also used : ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 2 with NGramStringListIterable

use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.

the class PhoneticNGramMC method getDocumentPhoneticNgrams.

/**
 * Builds a frequency distribution of minN- to maxN-length ngrams over the phonetic encodings of
 * the tokens in each sentence covered by {@code target}. Ngrams never span sentence boundaries.
 * <p>
 * The encoder is chosen from the document language of the jcas: {@code Soundex} for "en" and
 * {@code ColognePhonetic} for "de".
 *
 * @param jcas
 *            a jcas
 * @param target
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return a frequency distribution of phonetic ngrams joined with {@code NGRAM_GLUE}
 * @throws TextClassificationException
 *             if the document language is unset or neither "en" nor "de", or if a token cannot
 *             be encoded
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    StringEncoder encoder;
    String languageCode = jcas.getDocumentLanguage();
    // Constant-first comparison: a missing (null) language code is reported through the
    // declared exception below instead of surfacing as a NullPointerException.
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }
    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, target)) {
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                // Preserve the cause while converting to the exception type callers expect.
                throw new TextClassificationException(e);
            }
        }
        String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);
        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ArrayList(java.util.ArrayList) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) ColognePhonetic(org.apache.commons.codec.language.ColognePhonetic) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) StringEncoder(org.apache.commons.codec.StringEncoder) Soundex(org.apache.commons.codec.language.Soundex) EncoderException(org.apache.commons.codec.EncoderException) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 3 with NGramStringListIterable

use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.

the class NGramUtils method getDocumentNgrams.

/**
 * Collects document ngrams over an arbitrary annotation type that extends Annotation (for
 * example Lemma or Stem). Ngrams are built sentence by sentence, so they never span a sentence
 * boundary.
 *
 * @param jcas
 *            a jcas
 * @param aTarget
 *            target annotation span
 * @param lowerCaseNGrams
 *            whether each ngram is lower-cased before filtering and counting
 * @param filterPartialMatches
 *            passed through to the ngram filter together with the stopwords
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @param stopwords
 *            set of stopwords
 * @param annotationClass
 *            annotation type of the ngram
 * @return a frequency distribution of the ngrams that passed the filter, joined with
 *         {@code NGRAM_GLUE}
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords, Class<? extends Annotation> annotationClass) throws TextClassificationException {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    String typeName = annotationClass.getName();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> values = valuesToText(jcas, sentence, typeName);
        for (List<String> candidate : new NGramStringListIterable(values, minN, maxN)) {
            List<String> ngram = candidate;
            if (lowerCaseNGrams) {
                ngram = lower(candidate);
            }
            // Skip ngrams rejected by the stopword filter.
            if (!passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return counts;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 4 with NGramStringListIterable

use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.

the class PosNGramMC method sentenceBasedDistribution.

/**
 * Counts minN- to maxN-length POS ngrams separately within each sentence covered by
 * {@code focus}, so that no ngram spans a sentence boundary.
 *
 * @param jcas
 *            a jcas
 * @param focus
 *            the annotation whose covered sentences are processed
 * @param useCanonical
 *            if true, the simple class name of the POS annotation is used as the tag;
 *            otherwise its pos value
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return a frequency distribution of POS ngrams joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            String tag = useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue();
            tags.add(tag);
        }
        String[] tagArray = tags.toArray(new String[tags.size()]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String joined = StringUtils.join(ngram, NGRAM_GLUE);
            distribution.inc(joined);
        }
    }
    return distribution;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 5 with NGramStringListIterable

use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.

the class PosNGramMC method documentBasedDistribution.

/**
 * Counts minN- to maxN-length POS ngrams over the single sequence of all POS annotations
 * covered by {@code focus}, without regard to sentence boundaries.
 *
 * @param jcas
 *            a jcas
 * @param focus
 *            the annotation whose covered POS annotations are processed
 * @param useCanonical
 *            if true, the simple class name of the POS annotation is used as the tag;
 *            otherwise its pos value
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return a frequency distribution of POS ngrams joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    List<String> tags = new ArrayList<String>();
    for (POS pos : selectCovered(jcas, POS.class, focus)) {
        String tag = useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue();
        tags.add(tag);
    }
    String[] tagArray = tags.toArray(new String[tags.size()]);
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
        String joined = StringUtils.join(ngram, NGRAM_GLUE);
        distribution.inc(joined);
    }
    return distribution;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution): 6, NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable): 6, Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 5, ArrayList (java.util.ArrayList): 4, POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 2, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 2, EncoderException (org.apache.commons.codec.EncoderException): 1, StringEncoder (org.apache.commons.codec.StringEncoder): 1, ColognePhonetic (org.apache.commons.codec.language.ColognePhonetic): 1, Soundex (org.apache.commons.codec.language.Soundex): 1, TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 1