Search in sources :

Example 6 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class NGramUtils method getDocumentNgrams.

/**
 * Builds a frequency distribution of n-grams found inside the target span, using the text
 * values of an arbitrary annotation type (for example Lemma or Stem).
 * <p>
 * Every {@link Sentence} covered by {@code aTarget} is processed on its own, so n-grams are
 * built from one sentence's values at a time.
 *
 * @param jcas
 *            the JCas to read annotations from
 * @param aTarget
 *            annotation whose span limits the sentences that are considered
 * @param lowerCaseNGrams
 *            if {@code true}, each n-gram is passed through {@code lower(...)} before
 *            filtering and counting
 * @param filterPartialMatches
 *            forwarded to {@code passesNgramFilter(...)} together with the stopwords
 * @param minN
 *            smallest n-gram size
 * @param maxN
 *            largest n-gram size
 * @param stopwords
 *            stopword set forwarded to {@code passesNgramFilter(...)}
 * @param annotationClass
 *            annotation type whose values form the n-gram elements
 * @return the n-gram counts; each key is the n-gram elements joined with {@code NGRAM_GLUE}
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords, Class<? extends Annotation> annotationClass) throws TextClassificationException {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    String typeName = annotationClass.getName();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> values = valuesToText(jcas, sentence, typeName);
        for (List<String> rawNgram : new NGramStringListIterable(values, minN, maxN)) {
            // Normalise case first so the filter sees the same form that gets counted.
            List<String> candidate = lowerCaseNGrams ? lower(rawNgram) : rawNgram;
            if (!passesNgramFilter(candidate, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(candidate, NGRAM_GLUE));
        }
    }
    return counts;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 7 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class DeepLearningMajorityClass2OutcomeReport method determineMajorityClass.

/**
 * Reads the given file as UTF-8 and stores its most frequent token in {@code majorityClass}.
 * <p>
 * Each line is split on a single space character and every resulting piece is counted once;
 * the sample with the highest count, as reported by
 * {@code FrequencyDistribution.getSampleWithMaxFreq()}, becomes the majority class.
 *
 * @param f
 *            file to read
 * @throws Exception
 *             if the file cannot be opened, read or closed
 */
private void determineMajorityClass(File f) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();
    // try-with-resources closes the reader on every path and, unlike a quiet close,
    // reports a failure to close instead of discarding it. The charset constant avoids
    // a by-name charset lookup at runtime.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            for (String v : line.split(" ")) {
                fd.addSample(v, 1);
            }
        }
    }
    majorityClass = fd.getSampleWithMaxFreq();
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 8 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class PosNGramMC method sentenceBasedDistribution.

/**
 * Counts part-of-speech n-grams separately for each sentence covered by {@code focus}.
 * <p>
 * The tag sequence is rebuilt for every sentence, so each n-gram is formed from the tags of
 * a single sentence.
 *
 * @param jcas
 *            the JCas to read annotations from
 * @param focus
 *            annotation whose span limits the sentences that are considered
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the
 *            tag; otherwise its {@code getPosValue()} result is used
 * @param minN
 *            smallest n-gram size
 * @param maxN
 *            largest n-gram size
 * @return the n-gram counts; each key is the tags joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            String tag = useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue();
            tags.add(tag);
        }
        String[] tagArray = tags.toArray(new String[0]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String key = StringUtils.join(ngram, NGRAM_GLUE);
            counts.inc(key);
        }
    }
    return counts;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 9 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class PosNGramMC method documentBasedDistribution.

/**
 * Counts part-of-speech n-grams over the whole span covered by {@code focus}.
 * <p>
 * All POS annotations inside {@code focus} are collected into one tag sequence before the
 * n-grams are built; the sequence is not split by sentence.
 *
 * @param jcas
 *            the JCas to read annotations from
 * @param focus
 *            annotation whose span limits the POS annotations that are considered
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the
 *            tag; otherwise its {@code getPosValue()} result is used
 * @param minN
 *            smallest n-gram size
 * @param maxN
 *            largest n-gram size
 * @return the n-gram counts; each key is the tags joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    List<String> tags = new ArrayList<String>();
    for (POS pos : selectCovered(jcas, POS.class, focus)) {
        String tag = useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue();
        tags.add(tag);
    }
    String[] tagArray = tags.toArray(new String[0]);

    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
        String key = StringUtils.join(ngram, NGRAM_GLUE);
        counts.inc(key);
    }
    return counts;
}
Also used : POS(de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS) ArrayList(java.util.ArrayList) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 10 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

the class SkipWordNGramMC method getDocumentSkipNgrams.

/**
 * Builds a frequency distribution of skip-n-grams over the token texts of each sentence
 * covered by {@code anno}.
 *
 * @param jcas
 *            the JCas to read annotations from
 * @param anno
 *            annotation whose span limits the sentences that are considered
 * @param lowerCaseNGrams
 *            if {@code true}, each n-gram is passed through {@code lower(...)} before
 *            filtering and counting
 * @param filterPartialMatches
 *            forwarded to {@code passesNgramFilter(...)} together with the stopwords
 * @param minN
 *            smallest n-gram size
 * @param maxN
 *            largest n-gram size
 * @param skipN
 *            skip parameter handed to {@code SkipNgramStringListIterable}
 * @param stopwords
 *            stopword set forwarded to {@code passesNgramFilter(...)}
 * @return the n-gram counts; each key is the n-gram elements joined with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN, Set<String> stopwords) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, anno)) {
        for (List<String> rawNgram : new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, sentence)), minN, maxN, skipN)) {
            // Normalise case first so the filter sees the same form that gets counted.
            List<String> candidate = lowerCaseNGrams ? lower(rawNgram) : rawNgram;
            if (!passesNgramFilter(candidate, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(candidate, NGRAM_GLUE));
        }
    }
    return counts;
}
Also used : SkipNgramStringListIterable(org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)18 Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)6 NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable)6 Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)5 BufferedReader (java.io.BufferedReader)4 FileInputStream (java.io.FileInputStream)4 InputStreamReader (java.io.InputStreamReader)4 ArrayList (java.util.ArrayList)4 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)4 JCas (org.apache.uima.jcas.JCas)3 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)3 POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS)2 Fields (org.apache.lucene.index.Fields)2 IndexReader (org.apache.lucene.index.IndexReader)2 MultiFields (org.apache.lucene.index.MultiFields)2 Terms (org.apache.lucene.index.Terms)2 TermsEnum (org.apache.lucene.index.TermsEnum)2 BytesRef (org.apache.lucene.util.BytesRef)2 TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget)2 SkipNgramStringListIterable (org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable)2