Example 41 with Sentence

Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

The class BrownCorpusReader, method getNext:

@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    super.getNext(cas);
    JCas jcas;
    try {
        jcas = cas.getJCas();
    } catch (CASException e) {
        throw new CollectionException(e);
    }
    // each sentence becomes a classification sequence; each of its tokens becomes a unit with an outcome
    for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
        TextClassificationSequence sequence = new TextClassificationSequence(jcas, sentence.getBegin(), sentence.getEnd());
        sequence.addToIndexes();
        for (Token token : JCasUtil.selectCovered(jcas, Token.class, sentence)) {
            TextClassificationTarget unit = new TextClassificationTarget(jcas, token.getBegin(), token.getEnd());
            // will add the token content as a suffix to the ID of this unit
            unit.setSuffix(token.getCoveredText());
            unit.addToIndexes();
            TextClassificationOutcome outcome = new TextClassificationOutcome(jcas, token.getBegin(), token.getEnd());
            outcome.setOutcome(getTextClassificationOutcome(jcas, unit));
            outcome.addToIndexes();
        }
    }
}
Also used: CollectionException (org.apache.uima.collection.CollectionException), TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome), TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget), JCas (org.apache.uima.jcas.JCas), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), CASException (org.apache.uima.cas.CASException), TextClassificationSequence (org.dkpro.tc.api.type.TextClassificationSequence), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence)
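
The reader only creates annotations; nothing in this snippet consumes them. As a rough illustration of what ends up in the CAS, the hypothetical helper below (not part of dkpro-tc; method and variable names are made up) walks the sequences, units, and outcomes using the same JCasUtil calls as above, assuming the imports from the "Also used" list plus java.util.List.

// Hypothetical helper: prints the units and outcomes BrownCorpusReader added to the CAS.
static void dumpSequences(JCas jcas) {
    for (TextClassificationSequence seq : JCasUtil.select(jcas, TextClassificationSequence.class)) {
        List<TextClassificationTarget> units = JCasUtil.selectCovered(jcas, TextClassificationTarget.class, seq);
        List<TextClassificationOutcome> outcomes = JCasUtil.selectCovered(jcas, TextClassificationOutcome.class, seq);
        for (int i = 0; i < units.size(); i++) {
            // the suffix holds the token text; the outcome at the same offsets holds its label
            System.out.println(units.get(i).getSuffix() + " -> " + outcomes.get(i).getOutcome());
        }
    }
}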

Example 42 with Sentence

Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

The class SimilarityPairFeatureTest, method similarityPairFeatureTest:

@Test
public void similarityPairFeatureTest() throws Exception {
    ExternalResourceDescription gstResource = ExternalResourceFactory.createExternalResourceDescription(GreedyStringTilingMeasureResource.class, GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3");
    AnalysisEngineDescription desc = createEngineDescription(NoOpAnnotator.class);
    AnalysisEngine engine = createEngine(desc);
    JCas jcas = engine.newJCas();
    TokenBuilder<Token, Sentence> tb = new TokenBuilder<Token, Sentence>(Token.class, Sentence.class);
    JCas view1 = jcas.createView(VIEW1);
    view1.setDocumentLanguage("en");
    tb.buildTokens(view1, "This is a test .");
    JCas view2 = jcas.createView(VIEW2);
    view2.setDocumentLanguage("en");
    tb.buildTokens(view2, "Test is this .");
    engine.process(jcas);
    // configure the extractor with the greedy-string-tiling resource and Token as the segment feature path
    SimilarityPairFeatureExtractor extractor = FeatureUtil.createResource(SimilarityPairFeatureExtractor.class, SimilarityPairFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123", SimilarityPairFeatureExtractor.PARAM_SEGMENT_FEATURE_PATH, Token.class.getName(), SimilarityPairFeatureExtractor.PARAM_TEXT_SIMILARITY_RESOURCE, gstResource);
    Set<Feature> features = extractor.extract(jcas.getView(VIEW1), jcas.getView(VIEW2));
    Assert.assertEquals(1, features.size());
    Iterator<Feature> iter = features.iterator();
    assertFeature("SimilarityGreedyStringTiling_3", 0.8125, iter.next(), 0.0001);
}
Also used: TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder), AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription), JCas (org.apache.uima.jcas.JCas), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature), Feature (org.dkpro.tc.api.features.Feature), SimilarityPairFeatureExtractor (org.dkpro.tc.features.pair.similarity.SimilarityPairFeatureExtractor), ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription), AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine), Test (org.junit.Test)

Example 43 with Sentence

Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

The class KeywordNGramUtils, method getDocumentKeywordNgrams:

// all tokens should be already lowercased
/**
 * Finds all minN- to maxN-length ngrams of tokens occurring in the keyword list. All tokens
 * should already be lowercased, if applicable. The keyword list can contain multi-token words
 * like "Brussel sprouts". If the keyword list contains both "Brussel" and "Brussel sprouts", then
 * only "Brussel sprouts" will be added. Otherwise, the smallest matching multiword keyword will
 * be added.
 *
 * @param jcas
 *            a jcas
 * @param anno
 *            the annotation
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            whether to add a sentence boundary marker after each sentence
 * @param markSentenceLocation
 *            whether to tag the boundary marker with the sentence's relative location in the
 *            document (beginning, middle, or end)
 * @param includeCommas
 *            whether to include commas as tokens
 * @param keywords
 *            list of keywords
 * @return all ngrams of keywords in jcas
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    List<String> keywordList = new ArrayList<String>();
    int sentenceNumber = 0;
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, anno);
    int totalSentences = sentences.size();
    for (Sentence s : sentences) {
        List<Token> sentence = selectCovered(Token.class, s);
        for (int tokenpointer = 0; tokenpointer < sentence.size(); tokenpointer++) {
            String token = sentence.get(tokenpointer).getCoveredText();
            token = token.toLowerCase();
            String compositeNgram = "";
            boolean foundComposite = false;
            for (int i = tokenpointer; i >= 0; i--) {
                compositeNgram = sentence.get(i).getCoveredText().toLowerCase() + " " + compositeNgram;
                // trim the trailing space left over while the composite is still a single token
                if (compositeNgram.endsWith(" ")) {
                    compositeNgram = compositeNgram.replace(" ", "");
                }
                if (keywords.contains(compositeNgram)) {
                    keywordList.add(compositeNgram.replace(" ", MIDNGRAMGLUE));
                    foundComposite = true;
                }
            }
            if (!foundComposite && keywords.contains(token)) {
                keywordList.add(token);
            } else if (includeCommas && token.equals(",")) {
                keywordList.add(COMMA);
            }
        }
        String sentenceBoundary = SENTENCE_BOUNDARY;
        if (markSentenceLocation) {
            if (((double) sentenceNumber / totalSentences) < 0.25) {
                sentenceBoundary = sentenceBoundary + "BEG";
            } else if (((double) sentenceNumber / totalSentences) > 0.75) {
                sentenceBoundary = sentenceBoundary + "END";
            } else {
                sentenceBoundary = sentenceBoundary + "MID";
            }
        }
        if (markSentenceBoundary) {
            keywordList.add(sentenceBoundary);
        }
        sentenceNumber++;
    }
    for (List<String> ngram : new NGramStringListIterable(keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
        String ngramString = StringUtils.join(ngram, GLUE);
        documentNgrams.inc(ngramString);
    }
    return documentNgrams;
}
Also used: ArrayList (java.util.ArrayList), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable), FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)
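
A hedged usage sketch for getDocumentKeywordNgrams: the keyword set, the document-spanning target, the already-populated jcas variable, and the printing loop are assumptions for illustration; only the method signature shown above and the FrequencyDistribution accessors getKeys()/getCount() are relied on, with java.util imports assumed.

// Sketch: keyword unigrams to trigrams over an annotation spanning the whole document (assumed setup).
Set<String> keywords = new HashSet<>(Arrays.asList("brussel sprouts", "broccoli", "cabbage"));
TextClassificationTarget wholeDoc = new TextClassificationTarget(jcas, 0, jcas.getDocumentText().length());
FrequencyDistribution<String> fd = KeywordNGramUtils.getDocumentKeywordNgrams(
        jcas, wholeDoc, 1, 3,
        true,   // markSentenceBoundary
        false,  // markSentenceLocation
        false,  // includeCommas
        keywords);
for (String ngram : fd.getKeys()) {
    System.out.println(ngram + "\t" + fd.getCount(ngram));
}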

Example 44 with Sentence

Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

The class SentenceRatioPerDocument, method extract:

@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    // maximum sentence count used for normalization, provided by the base class
    long maxLen = getMax();
    List<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, aTarget);
    // feature value: number of sentences in the target relative to that maximum
    double ratio = getRatio(sentences.size(), maxLen);
    return new Feature(FEATURE_NAME, ratio, FeatureType.NUMERIC).asSet();
}
Also used: Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), Feature (org.dkpro.tc.api.features.Feature)

Example 45 with Sentence

Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro.

The class PhoneticNGramMC, method getDocumentPhoneticNgrams:

public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    // pick a phonetic encoder for the document language: Soundex for English, Cologne phonetics for German
    StringEncoder encoder;
    String languageCode = jcas.getDocumentLanguage();
    if (languageCode.equals("en")) {
        encoder = new Soundex();
    } else if (languageCode.equals("de")) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }
    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, target)) {
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
        }
        String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);
        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}
Also used: TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException), ArrayList (java.util.ArrayList), Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token), ColognePhonetic (org.apache.commons.codec.language.ColognePhonetic), NGramStringListIterable (de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable), StringEncoder (org.apache.commons.codec.StringEncoder), Soundex (org.apache.commons.codec.language.Soundex), EncoderException (org.apache.commons.codec.EncoderException), Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence), FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution)
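
A small end-to-end sketch for getDocumentPhoneticNgrams, assuming a JCas built with uimaFIT's JCasFactory and TokenBuilder as in Example 42; the demo method, its input text, and the document-spanning target are made up for illustration, and only the method signature shown above is relied on.

// Hypothetical demo, not part of dkpro-tc: phonetic uni- and bigrams for a short English document.
static void demoPhoneticNgrams() throws Exception {
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");  // "en" selects the Soundex encoder, "de" would select ColognePhonetic
    TokenBuilder<Token, Sentence> tb = new TokenBuilder<>(Token.class, Sentence.class);
    tb.buildTokens(jcas, "Their plant grows here .");
    // an annotation spanning the whole document text serves as the target
    TextClassificationTarget target = new TextClassificationTarget(jcas, 0, jcas.getDocumentText().length());
    FrequencyDistribution<String> ngrams = PhoneticNGramMC.getDocumentPhoneticNgrams(jcas, target, 1, 2);
    for (String ngram : ngrams.getKeys()) {
        System.out.println(ngram + "\t" + ngrams.getCount(ngram));
    }
}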

Aggregations

Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 90
JCas (org.apache.uima.jcas.JCas): 41
Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 34
ArrayList (java.util.ArrayList): 22
AnnotatorState (de.tudarmstadt.ukp.clarin.webanno.api.annotation.model.AnnotatorState): 14
Type (org.apache.uima.cas.Type): 12
AnnotationFS (org.apache.uima.cas.text.AnnotationFS): 12
IOException (java.io.IOException): 9
SourceDocument (de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument): 8
POS (de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS): 8
Test (org.junit.Test): 8
HashMap (java.util.HashMap): 7
TokenBuilder (org.apache.uima.fit.testing.factory.TokenBuilder): 7
AnnotationException (de.tudarmstadt.ukp.clarin.webanno.api.annotation.exception.AnnotationException): 6
WebAnnoCasUtil.getFirstSentence (de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getFirstSentence): 6
AnnotationDocument (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationDocument): 6
AnnotationFeature (de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature): 6
FrequencyDistribution (de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution): 6
CASException (org.apache.uima.cas.CASException): 6
AutomationTypeAdapter (de.tudarmstadt.ukp.clarin.webanno.api.annotation.adapter.AutomationTypeAdapter): 5