Search in sources:

Example 1 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class BrownCorpusReader method getNext.

/**
 * Delegates to the superclass to fill the CAS, then adds classification annotations on top
 * of the existing segmentation: every {@code Sentence} gets a
 * {@code TextClassificationSequence} with the same offsets, and every {@code Token} covered
 * by that sentence gets a {@code TextClassificationTarget} plus a
 * {@code TextClassificationOutcome} with the token's offsets.
 *
 * @param cas the CAS to fill and annotate
 * @throws IOException declared by the signature; may originate from the superclass call
 * @throws CollectionException if the JCas cannot be obtained from the CAS
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    super.getNext(cas);

    JCas document;
    try {
        document = cas.getJCas();
    } catch (CASException casFailure) {
        throw new CollectionException(casFailure);
    }

    for (Sentence span : JCasUtil.select(document, Sentence.class)) {
        // One sequence annotation per sentence, sharing the sentence's offsets.
        TextClassificationSequence sentenceSequence =
                new TextClassificationSequence(document, span.getBegin(), span.getEnd());
        sentenceSequence.addToIndexes();

        for (Token word : JCasUtil.selectCovered(document, Token.class, span)) {
            TextClassificationTarget target =
                    new TextClassificationTarget(document, word.getBegin(), word.getEnd());
            // The token text is stored as the suffix that gets appended to this target's ID.
            target.setSuffix(word.getCoveredText());
            target.addToIndexes();

            // The outcome label is resolved only after the target has been indexed.
            TextClassificationOutcome label =
                    new TextClassificationOutcome(document, word.getBegin(), word.getEnd());
            label.setOutcome(getTextClassificationOutcome(document, target));
            label.addToIndexes();
        }
    }
}
Also used : CollectionException(org.apache.uima.collection.CollectionException) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) CASException(org.apache.uima.cas.CASException) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) Sentence(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)

Example 2 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class CosineFeatureExtractor method extract.

/**
 * Computes one numeric similarity feature for a pair of views.
 * <p>
 * The single {@code TextClassificationTarget} of each view is looked up, the n-gram keys of
 * each view are collected, and the two key sets are handed to the configured similarity
 * measure. The feature is named {@code "Similarity"} followed by the measure's name.
 *
 * @param view1 the first view
 * @param view2 the second view
 * @return a set containing the single similarity feature
 * @throws TextClassificationException if the similarity measure fails
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        TextClassificationTarget firstTarget =
                JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget secondTarget =
                JCasUtil.selectSingle(view2, TextClassificationTarget.class);

        // Note: getSimilarity(String, String) is *not* a convenience
        // method for getSimilarity(Collection<String>, Collection<String>),
        // so the collection-based overload is used with the n-gram key sets.
        // The min/max n arguments are both 1 (presumably unigrams - confirm against NGramUtils).
        Set<String> firstKeys = NGramUtils
                .getDocumentNgrams(view1, firstTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();
        Set<String> secondKeys = NGramUtils
                .getDocumentNgrams(view2, secondTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();

        double rawScore = measure.getSimilarity(firstKeys, secondKeys);
        // Temporary fix for DKPro Similarity Issue 30: a NaN score is reported as 0.0.
        double score = Double.isNaN(rawScore) ? 0.0 : rawScore;

        String featureName = "Similarity" + measure.getName();
        return new Feature(featureName, score, FeatureType.NUMERIC).asSet();
    } catch (SimilarityException similarityFailure) {
        throw new TextClassificationException(similarityFailure);
    }
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) SimilarityException(dkpro.similarity.algorithms.api.SimilarityException) Feature(org.dkpro.tc.api.features.Feature)

Example 3 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class CharacterNGramMC method getNgramsFD.

/**
 * Builds the character n-gram frequency distribution for the whole document text.
 * <p>
 * A {@code TextClassificationTarget} spanning offset 0 to the length of the document text is
 * created (it is not added to the indexes here) and passed to
 * {@code getAnnotationCharacterNgrams} together with the configured case and n-gram size
 * settings and the characters {@code '^'} and {@code '$'}.
 *
 * @param jcas the JCas whose document text is processed
 * @return the frequency distribution returned by {@code getAnnotationCharacterNgrams}
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) {
    int documentLength = jcas.getDocumentText().length();
    TextClassificationTarget wholeText = new TextClassificationTarget(jcas, 0, documentLength);
    return getAnnotationCharacterNgrams(wholeText, lowerCase, ngramMinN, ngramMaxN, '^', '$');
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget)

Example 4 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class WordNGramMC method getNgramsFD.

/**
 * Builds the token n-gram frequency distribution for the whole document text.
 * <p>
 * A {@code TextClassificationTarget} spanning offset 0 to the length of the document text is
 * created (it is not added to the indexes here) and passed to
 * {@code NGramUtils.getDocumentNgrams} with the configured case, stopword and n-gram size
 * settings, using {@code Token} as the annotation type.
 *
 * @param jcas the JCas whose document text is processed
 * @return the frequency distribution returned by {@code NGramUtils.getDocumentNgrams}
 * @throws TextClassificationException declared by the signature; presumably raised by the
 *         n-gram extraction - confirm against NGramUtils
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) throws TextClassificationException {
    int documentLength = jcas.getDocumentText().length();
    TextClassificationTarget wholeText = new TextClassificationTarget(jcas, 0, documentLength);
    return NGramUtils.getDocumentNgrams(
            jcas,
            wholeText,
            ngramLowerCase,
            filterPartialStopwordMatches,
            ngramMinN,
            ngramMaxN,
            stopwords,
            Token.class);
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token)

Example 5 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class KeywordNGramUtils method getMultipleViewKeywordNgrams.

/**
 * Collects keyword n-grams from several views into one frequency distribution.
 * <p>
 * For each view, its single {@code TextClassificationTarget} is looked up and
 * {@code getDocumentKeywordNgrams} is called with the given settings; every key of the
 * per-view result is then added to the combined distribution with its per-view count.
 *
 * @param jcases the views to process
 * @param minN passed through to {@code getDocumentKeywordNgrams}
 * @param maxN passed through to {@code getDocumentKeywordNgrams}
 * @param markSentenceBoundary passed through to {@code getDocumentKeywordNgrams}
 * @param markSentenceLocation passed through to {@code getDocumentKeywordNgrams}
 * @param includeCommas passed through to {@code getDocumentKeywordNgrams}
 * @param keywords passed through to {@code getDocumentKeywordNgrams}
 * @return the combined frequency distribution over all views
 */
public static FrequencyDistribution<String> getMultipleViewKeywordNgrams(
        List<JCas> jcases,
        int minN,
        int maxN,
        boolean markSentenceBoundary,
        boolean markSentenceLocation,
        boolean includeCommas,
        Set<String> keywords) {
    FrequencyDistribution<String> combined = new FrequencyDistribution<String>();

    for (JCas currentView : jcases) {
        TextClassificationTarget target =
                JCasUtil.selectSingle(currentView, TextClassificationTarget.class);
        FrequencyDistribution<String> perView = getDocumentKeywordNgrams(
                currentView,
                target,
                minN,
                maxN,
                markSentenceBoundary,
                markSentenceLocation,
                includeCommas,
                keywords);

        // This is a hack because there's no method to combine 2 FD's:
        // each key is re-added to the combined distribution with its per-view count.
        for (String ngram : perView.getKeys()) {
            combined.addSample(ngram, perView.getCount(ngram));
        }
    }

    return combined;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) FrequencyDistribution(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)

Aggregations

TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget) 61, JCas (org.apache.uima.jcas.JCas) 29, ArrayList (java.util.ArrayList) 22, TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome) 18, Feature (org.dkpro.tc.api.features.Feature) 16, Test (org.junit.Test) 16, AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine) 12, TextClassificationSequence (org.dkpro.tc.api.type.TextClassificationSequence) 12, Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) 11, JCasId (org.dkpro.tc.api.type.JCasId) 11, AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription) 8, AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException) 7, TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException) 7, FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature) 6, CollectionReader (org.apache.uima.collection.CollectionReader) 5, FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) 5, DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData) 4, Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence) 4, OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger) 4, BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter) 4