Search in sources :

Example 6 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class LuceneKeywordPFE method extract.

/**
 * Extracts keyword n-gram features for a pair of views.
 *
 * <p>Keyword n-grams are collected for each view's single
 * {@link TextClassificationTarget}, plus a combined distribution from
 * {@code getViewNgrams}. Which of these are turned into features is decided by the
 * {@code useView1NgramsAsFeatures}, {@code useView2NgramsAsFeatures},
 * {@code useViewBlindNgramsAsFeatures} and {@code markViewBlindNgramsWithLocalView}
 * settings.
 *
 * @param view1 the first view of the pair
 * @param view2 the second view of the pair
 * @return the features gathered from all enabled n-gram sources
 * @throws TextClassificationException declared by the contract; propagated from the helpers
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    final TextClassificationTarget targetOfView1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    final TextClassificationTarget targetOfView2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    final FrequencyDistribution<String> ngramsOfView1 = KeywordNGramUtils.getDocumentKeywordNgrams(view1, targetOfView1, ngramMinN1, ngramMaxN1, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
    final FrequencyDistribution<String> ngramsOfView2 = KeywordNGramUtils.getDocumentKeywordNgrams(view2, targetOfView2, ngramMinN2, ngramMaxN2, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
    final FrequencyDistribution<String> viewBlindNgrams = getViewNgrams(view1, view2);

    Set<Feature> collected = new HashSet<>();

    // The prefix field is set right before every addToFeatureArray call;
    // presumably that helper reads it to name the features -- confirm in the base class.
    if (useView1NgramsAsFeatures) {
        prefix = "keyNG1";
        collected = addToFeatureArray(ngramsOfView1, topKSetView1, collected);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "keyNG2";
        collected = addToFeatureArray(ngramsOfView2, topKSetView2, collected);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (markViewBlindNgramsWithLocalView) {
            // Shared top-k set, but each view's n-grams get their own prefix.
            prefix = "keyNGall1";
            collected = addToFeatureArray(ngramsOfView1, topKSet, collected);
            prefix = "keyNGall2";
            collected = addToFeatureArray(ngramsOfView2, topKSet, collected);
        } else {
            prefix = "keyNG";
            collected = addToFeatureArray(viewBlindNgrams, topKSet, collected);
        }
    }

    return collected;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 7 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class LuceneNGramPFE method extract.

/**
 * Extracts token n-gram features for a pair of views.
 *
 * <p>N-grams are collected for each view's single {@link TextClassificationTarget},
 * plus a combined distribution from {@code getViewNgrams}. The
 * {@code useView1NgramsAsFeatures}, {@code useView2NgramsAsFeatures},
 * {@code useViewBlindNgramsAsFeatures} and {@code markViewBlindNgramsWithLocalView}
 * settings decide which distributions are converted into features.
 *
 * @param view1 the first view of the pair
 * @param view2 the second view of the pair
 * @return the features gathered from all enabled n-gram sources
 * @throws TextClassificationException declared by the contract; propagated from the helpers
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    final TextClassificationTarget targetOfView1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    final TextClassificationTarget targetOfView2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    final FrequencyDistribution<String> ngramsOfView1 = NGramUtils.getDocumentNgrams(view1, targetOfView1, ngramLowerCase, filterPartialStopwordMatches, ngramMinN1, ngramMaxN1, stopwords, Token.class);
    final FrequencyDistribution<String> ngramsOfView2 = NGramUtils.getDocumentNgrams(view2, targetOfView2, ngramLowerCase, filterPartialStopwordMatches, ngramMinN2, ngramMaxN2, stopwords, Token.class);
    final FrequencyDistribution<String> viewBlindNgrams = getViewNgrams(view1, view2);

    Set<Feature> collected = new HashSet<>();

    // The prefix field is set right before every addToFeatureArray call;
    // presumably that helper reads it to name the features -- confirm in the base class.
    if (useView1NgramsAsFeatures) {
        prefix = "view1NG";
        collected = addToFeatureArray(ngramsOfView1, topKSetView1, collected);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "view2NG";
        collected = addToFeatureArray(ngramsOfView2, topKSetView2, collected);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (markViewBlindNgramsWithLocalView) {
            // Shared top-k set, but each view's n-grams get their own prefix.
            prefix = "view1allNG";
            collected = addToFeatureArray(ngramsOfView1, topKSet, collected);
            prefix = "view2allNG";
            collected = addToFeatureArray(ngramsOfView2, topKSet, collected);
        } else {
            prefix = "allNG";
            collected = addToFeatureArray(viewBlindNgrams, topKSet, collected);
        }
    }

    return collected;
}
Also used : TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) Token(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet)

Example 8 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class TestReaderSentenceToDocument method getNext.

/**
 * Fills the given CAS with the text at the current {@code offset} and advances the offset.
 *
 * <p>Besides text and language, the CAS receives document meta data, a {@code JCasId},
 * a {@code TextClassificationOutcome} and a {@code TextClassificationTarget} that spans
 * the whole document text.
 *
 * @param aJCas the CAS to populate
 * @throws IOException declared by the reader contract
 * @throws CollectionException declared by the reader contract
 */
@Override
public void getNext(JCas aJCas) throws IOException, CollectionException {
    // document text and language
    final String sentenceText = texts.get(offset);
    aJCas.setDocumentText(sentenceText);
    aJCas.setDocumentLanguage(LANGUAGE_CODE);

    // Several CASes are created from one file, so every CAS gets its own
    // title and URI; otherwise serialized CASes would overwrite each other.
    final String sentenceName = "Sentence" + offset;
    DocumentMetaData metaData = DocumentMetaData.create(aJCas);
    metaData.setDocumentTitle(sentenceName);
    metaData.setDocumentUri(sentenceName);
    metaData.setDocumentId(String.valueOf(offset));

    JCasId casId = new JCasId(aJCas);
    casId.setId(jcasId);
    casId.addToIndexes();

    // outcome / label of this document
    TextClassificationOutcome label = new TextClassificationOutcome(aJCas);
    label.setOutcome(getTextClassificationOutcome(aJCas));
    label.addToIndexes();

    // the classification target covers the complete document text
    final int documentLength = aJCas.getDocumentText().length();
    TextClassificationTarget wholeDocument = new TextClassificationTarget(aJCas, 0, documentLength);
    wholeDocument.addToIndexes();

    offset++;
}
Also used : JCasId(org.dkpro.tc.api.type.JCasId) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) DocumentMetaData(de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData)

Example 9 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class LuceneCPMetaCollectorBase method process.

/**
 * Collects n-gram statistics for a pair document and writes them as index fields.
 *
 * <p>The two views {@code Constants.PART_ONE} and {@code Constants.PART_TWO} are read from
 * the given CAS. Four groups of fields are written through {@code addField}:
 * <ol>
 * <li>n-grams over both views, under {@code getFieldName()},</li>
 * <li>n-grams of view 1, under {@code getFieldNameView1()},</li>
 * <li>n-grams of view 2, under {@code getFieldNameView2()},</li>
 * <li>combinations of one view-1 n-gram and one view-2 n-gram whose summed size lies in
 * {@code [getNgramMinNCombo(), getNgramMaxNCombo()]}, under {@code getFieldNameCombo()}.</li>
 * </ol>
 * Every n-gram is added once per occurrence counted in its frequency distribution.
 *
 * @param jcas the CAS holding both views of the pair
 * @throws AnalysisEngineProcessException if a view cannot be obtained or n-gram
 *         extraction fails with a {@code TextClassificationException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas view1;
    JCas view2;
    try {
        view1 = jcas.getView(Constants.PART_ONE);
        view2 = jcas.getView(Constants.PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    List<JCas> jcases = new ArrayList<JCas>();
    jcases.add(view1);
    jcases.add(view2);
    FrequencyDistribution<String> view1NGrams;
    FrequencyDistribution<String> view2NGrams;
    FrequencyDistribution<String> documentNGrams;
    try {
        TextClassificationTarget aTarget1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget aTarget2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);
        view1NGrams = getNgramsFDView1(view1, aTarget1);
        view2NGrams = getNgramsFDView2(view2, aTarget2);
        documentNGrams = getNgramsFD(jcases);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // One field entry per counted occurrence of each n-gram.
    for (String ngram : documentNGrams.getKeys()) {
        for (int i = 0; i < documentNGrams.getCount(ngram); i++) {
            addField(getFieldName(), ngram);
        }
    }
    for (String ngram : view1NGrams.getKeys()) {
        for (int i = 0; i < view1NGrams.getCount(ngram); i++) {
            addField(getFieldNameView1(), ngram);
        }
    }
    for (String ngram : view2NGrams.getKeys()) {
        for (int i = 0; i < view2NGrams.getCount(ngram); i++) {
            addField(getFieldNameView2(), ngram);
        }
    }
    for (String ngram1 : view1NGrams.getKeys()) {
        // The size of ngram1 does not depend on ngram2, so split it once per outer iteration.
        final int ngram1Size = ngram1.split(NGRAM_GLUE).length;
        for (String ngram2 : view2NGrams.getKeys()) {
            int combinedSize = ngram1Size + ngram2.split(NGRAM_GLUE).length;
            if (combinedSize <= getNgramMaxNCombo() && combinedSize >= getNgramMinNCombo()) {
                // The combination is added once per co-occurrence, i.e. count(ngram1) * count(ngram2) times.
                // NOTE(review): an earlier comment here said "set count = 1, for doc freq and not
                // total term freq", which does not match this computation -- confirm which is intended.
                long count = view1NGrams.getCount(ngram1) * view2NGrams.getCount(ngram2);
                // long index so the comparison against the long bound cannot overflow
                for (long i = 0; i < count; i++) {
                    addField(getFieldNameCombo(), ngram1 + ComboUtils.JOINT + ngram2);
                }
            }
        }
    }
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException)

Example 10 with TextClassificationTarget

use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.

the class NGramUtilsTest method phoneticNgramsTest.

/**
 * Checks phonetic n-gram extraction (n = 1..3) on a five-token sentence:
 * the distribution must hold 12 n-grams in total and contain the codes
 * {@code I000} and {@code T200}.
 */
@Test
public void phoneticNgramsTest() throws Exception {
    final String sentence = "This is a big house";

    JCas cas = JCasFactory.createJCas();
    cas.setDocumentLanguage("en");
    cas.setDocumentText(sentence);

    // the classification target spans the whole sentence
    TextClassificationTarget target = new TextClassificationTarget(cas, 0, sentence.length());
    target.addToIndexes();

    // one Token annotation per whitespace-separated word, then one Sentence starting at offset 0
    JCasBuilder builder = new JCasBuilder(cas);
    String[] words = sentence.split(" ");
    for (int w = 0; w < words.length; w++) {
        builder.add(words[w], Token.class);
    }
    builder.add(0, Sentence.class);

    FrequencyDistribution<String> phoneticNgrams = PhoneticNGramMC.getDocumentPhoneticNgrams(cas, target, 1, 3);

    assertEquals(12, phoneticNgrams.getN());
    assertTrue(phoneticNgrams.contains("I000"));
    assertTrue(phoneticNgrams.contains("T200"));
}
Also used : JCasBuilder(org.apache.uima.fit.factory.JCasBuilder) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) JCas(org.apache.uima.jcas.JCas) Test(org.junit.Test)

Aggregations

TextClassificationTarget (org.dkpro.tc.api.type.TextClassificationTarget): 61
JCas (org.apache.uima.jcas.JCas): 29
ArrayList (java.util.ArrayList): 22
TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome): 18
Feature (org.dkpro.tc.api.features.Feature): 16
Test (org.junit.Test): 16
AnalysisEngine (org.apache.uima.analysis_engine.AnalysisEngine): 12
TextClassificationSequence (org.dkpro.tc.api.type.TextClassificationSequence): 12
Token (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token): 11
JCasId (org.dkpro.tc.api.type.JCasId): 11
AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 8
AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException): 7
TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 7
FeatureTestUtil.assertFeature (org.dkpro.tc.testing.FeatureTestUtil.assertFeature): 6
CollectionReader (org.apache.uima.collection.CollectionReader): 5
FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase): 5
DocumentMetaData (de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData): 4
Sentence (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence): 4
OpenNlpPosTagger (de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger): 4
BreakIteratorSegmenter (de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter): 4