Search in sources :

Example 11 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

Method process of the class IdfPairMetaCollector.

/**
 * Collects the n-grams of both parts of a document pair and adds them as
 * fields to the current index document, then writes that document to the index.
 *
 * <p>Each view contributes at most one count per distinct n-gram, so an n-gram
 * is added once if it occurs in only one view and twice if it occurs in both.
 *
 * @param jcas the CAS holding the {@code PART_ONE} and {@code PART_TWO} views
 * @throws AnalysisEngineProcessException if a view cannot be obtained, the
 *         n-grams cannot be extracted, or writing to the index fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas firstView;
    JCas secondView;
    try {
        firstView = jcas.getView(PART_ONE);
        secondView = jcas.getView(PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    FrequencyDistribution<String> firstNGrams;
    FrequencyDistribution<String> secondNGrams;
    try {
        firstNGrams = getNgramsFD(firstView);
        secondNGrams = getNgramsFD(secondView);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Original note: this is different than other metacollectors.
    // Only the distinct keys of each view are merged, not their raw counts.
    FrequencyDistribution<String> pairNGrams = new FrequencyDistribution<String>();
    for (String firstKey : firstNGrams.getKeys()) {
        pairNGrams.addSample(firstKey, 1);
    }
    for (String secondKey : secondNGrams.getKeys()) {
        pairNGrams.addSample(secondKey, 1);
    }

    // Add one field per counted occurrence of each merged n-gram.
    for (String ngram : pairNGrams.getKeys()) {
        long occurrences = pairNGrams.getCount(ngram);
        for (int added = 0; added < occurrences; added++) {
            currentDocument.add(new Field(getFieldName(), ngram, fieldType));
        }
    }

    try {
        writeToIndex();
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Also used : Field(org.apache.lucene.document.Field) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) JCas(org.apache.uima.jcas.JCas) IOException(java.io.IOException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 12 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

Method getCharacterSkipNgrams of the class SkipCharacterNGramMC.

/**
 * Counts character skip-n-grams over every token covered by {@code target}.
 *
 * <p>Each token is turned into a sequence of single-character strings framed
 * by a begin marker {@code "^"} and an end marker {@code "$"}; skip-n-grams
 * are then generated over that sequence and joined with {@code NGRAM_GLUE}.
 *
 * @param jcas            the CAS to read tokens from
 * @param target          the annotation whose covered tokens are processed
 * @param lowerCaseNGrams whether each n-gram is lower-cased before counting
 * @param minN            minimum n-gram length passed to the skip-n-gram iterator
 * @param maxN            maximum n-gram length passed to the skip-n-gram iterator
 * @param skipN           skip size passed to the skip-n-gram iterator
 * @return frequency distribution of the joined skip-n-gram strings
 */
public static FrequencyDistribution<String> getCharacterSkipNgrams(JCas jcas, Annotation target, boolean lowerCaseNGrams, int minN, int maxN, int skipN) {
    FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
    for (Token t : selectCovered(jcas, Token.class, target)) {
        String tokenText = t.getCoveredText();
        int length = tokenText.length();
        // Build the framed character array explicitly instead of relying on
        // tokenText.split(""): since Java 8 that call no longer yields a leading
        // empty string, so overwriting index 0 with "^" would drop the token's
        // first character.
        String[] chars = new String[length + 2];
        chars[0] = "^";
        for (int i = 0; i < length; i++) {
            chars[i + 1] = String.valueOf(tokenText.charAt(i));
        }
        chars[length + 1] = "$";
        for (List<String> ngram : new SkipNgramStringListIterable(chars, minN, maxN, skipN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
            charNgrams.inc(ngramString);
        }
    }
    return charNgrams;
}
Also used : SkipNgramStringListIterable(org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 13 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

Method getAnnotationNgrams of the class NGramUtils.

/**
 * Counts token n-grams inside {@code focusAnnotation}.
 *
 * <p>If the focus annotation covers at least one sentence, n-grams are built
 * per sentence from the tokens covered by that sentence. Otherwise they are
 * built from all tokens covered by the focus annotation itself.
 *
 * @param jcas                 the CAS to read annotations from
 * @param focusAnnotation      the annotation that bounds the extraction
 * @param lowerCaseNGrams      whether each n-gram is lower-cased before filtering
 * @param filterPartialMatches passed through to {@code passesNgramFilter}
 * @param minN                 minimum n-gram length
 * @param maxN                 maximum n-gram length
 * @param stopwords            stopword set passed through to {@code passesNgramFilter}
 * @return frequency distribution of the n-grams that passed the filter,
 *         joined with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas, Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, focusAnnotation);

    // No sentences covered: fall back to all tokens of the focus annotation.
    if (sentences.isEmpty()) {
        for (List<String> candidate : new NGramStringListIterable(toText(selectCovered(Token.class, focusAnnotation)), minN, maxN)) {
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
        return counts;
    }

    // Sentences present: n-grams are built within each sentence separately.
    for (Sentence sentence : sentences) {
        for (List<String> candidate : new NGramStringListIterable(toText(selectCovered(Token.class, sentence)), minN, maxN)) {
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;
            if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
            }
        }
    }
    return counts;
}
Also used : Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) NGramStringListIterable(de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 14 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

Method determineMajorityClass of the class LibsvmDataFormatBaselineMajorityClassIdReport.

/**
 * Reads {@code file} as UTF-8, counts the first tab-separated column of every
 * non-empty line, and stores the most frequent value in {@code majorityClass}.
 *
 * @param file the file to scan (presumably one instance per line with the
 *             class id in the first column; confirm against the writer)
 * @throws Exception if the file cannot be opened, read, or closed
 */
private void determineMajorityClass(File file) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();
    // try-with-resources closes the reader and its wrapped streams on every
    // exit path; a failure on close is reported instead of silently dropped.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            // Only the first column is counted; the rest of the line is ignored.
            String[] split = line.split("\t");
            fd.addSample(split[0], 1);
        }
    }
    majorityClass = fd.getSampleWithMaxFreq();
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Example 15 with FrequencyDistribution

use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.

Method determineMajorityClass of the class CrfSuiteBaselineMajorityClassIdReport.

/**
 * Scans {@code file} (decoded as UTF-8) line by line, tallies the value in
 * the first tab-separated column of each non-empty line, and assigns the
 * value with the highest tally to {@code majorityClass}.
 *
 * @param file the file to scan (presumably holds the class id in the first
 *             column of each line; confirm against the producing component)
 * @throws Exception if opening, reading, or closing the file fails
 */
private void determineMajorityClass(File file) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();
    // The reader is declared as a resource so it is always closed, and a
    // close failure surfaces to the caller rather than being swallowed.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            // Count the first column only; remaining columns are not inspected.
            String[] split = line.split("\t");
            fd.addSample(split[0], 1);
        }
    }
    majorityClass = fd.getSampleWithMaxFreq();
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) FileInputStream(java.io.FileInputStream) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution): 18; Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 6; NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable): 6; Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 5; BufferedReader (java.io.BufferedReader): 4; FileInputStream (java.io.FileInputStream): 4; InputStreamReader (java.io.InputStreamReader): 4; ArrayList (java.util.ArrayList): 4; TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 4; JCas (org.apache.uima.jcas.JCas): 3; ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException): 3; POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 2; Fields (org.apache.lucene.index.Fields): 2; IndexReader (org.apache.lucene.index.IndexReader): 2; MultiFields (org.apache.lucene.index.MultiFields): 2; Terms (org.apache.lucene.index.Terms): 2; TermsEnum (org.apache.lucene.index.TermsEnum): 2; BytesRef (org.apache.lucene.util.BytesRef): 2; TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 2; SkipNgramStringListIterable (org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable): 2