use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class LuceneNGramCPFE method extract.
/**
 * Extracts combined ("combo") n-gram features from a pair of views.
 *
 * <p>The n-grams of each view are computed with that view's own min/max n settings,
 * combined via {@code ComboUtils.getCombinedNgrams}, and then turned into features
 * by {@code addToFeatureArray} against {@code topKSetCombo}.
 *
 * <p>Side effect: sets the {@code prefix} field to {@code "comboNG"} before the
 * features are built.
 *
 * @param view1 the first view of the pair
 * @param view2 the second view of the pair
 * @return the set returned by {@code addToFeatureArray}
 * @throws TextClassificationException declared by the interface; may be raised by the
 *         n-gram helpers
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    // selectSingle is used on purpose: each view is expected to carry one target.
    TextClassificationTarget firstTarget = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    TextClassificationTarget secondTarget = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    FrequencyDistribution<String> firstNgrams = NGramUtils.getDocumentNgrams(view1, firstTarget,
            ngramLowerCase, filterPartialStopwordMatches, ngramMinN1, ngramMaxN1, stopwords, Token.class);
    FrequencyDistribution<String> secondNgrams = NGramUtils.getDocumentNgrams(view2, secondTarget,
            ngramLowerCase, filterPartialStopwordMatches, ngramMinN2, ngramMaxN2, stopwords, Token.class);

    FrequencyDistribution<String> comboNgrams = ComboUtils.getCombinedNgrams(firstNgrams, secondNgrams,
            ngramMinNCombo, ngramMaxNCombo, ngramUseSymmetricalCombos);

    prefix = "comboNG";
    return addToFeatureArray(comboNgrams, topKSetCombo, new HashSet<Feature>());
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class ComboUtils method getMultipleViewNgrams.
/**
 * Collects the n-grams of several views into a single frequency distribution.
 *
 * <p>For each view, n-grams are taken either from the whole document (when
 * {@code classificationUnit} is {@code null}) or from the span of the given
 * annotation. The per-view counts are then added onto the running total.
 *
 * @param jcases the views to process
 * @param classificationUnit annotation to restrict n-gram extraction to, or
 *        {@code null} to use the view's single {@code TextClassificationTarget}
 * @param ngramLowerCase passed through to the n-gram helper
 * @param filterPartialStopwords passed through to the n-gram helper
 * @param ngramMinN minimum n-gram length
 * @param ngramMaxN maximum n-gram length
 * @param stopwords stopword set passed through to the n-gram helper
 * @return the summed n-gram counts over all views; empty if {@code jcases} is empty
 * @throws TextClassificationException declared for callers; may be raised by the
 *         n-gram helpers
 */
public static FrequencyDistribution<String> getMultipleViewNgrams(List<JCas> jcases, Annotation classificationUnit, boolean ngramLowerCase, boolean filterPartialStopwords, int ngramMinN, int ngramMaxN, Set<String> stopwords) throws TextClassificationException {
    FrequencyDistribution<String> viewNgramsTotal = new FrequencyDistribution<String>();
    for (JCas view : jcases) {
        FrequencyDistribution<String> oneViewsNgrams;
        if (classificationUnit == null) {
            // The target is only needed for document-level extraction, so look it up
            // here; selectSingle fails unless the view holds exactly one target, and
            // that requirement should not affect the annotation-based branch.
            TextClassificationTarget aTarget = JCasUtil.selectSingle(view, TextClassificationTarget.class);
            oneViewsNgrams = NGramUtils.getDocumentNgrams(view, aTarget, ngramLowerCase, filterPartialStopwords, ngramMinN, ngramMaxN, stopwords, Token.class);
        } else {
            oneViewsNgrams = NGramUtils.getAnnotationNgrams(view, classificationUnit, ngramLowerCase, filterPartialStopwords, ngramMinN, ngramMaxN, stopwords);
        }
        // Merge by re-adding each key with its count; the distributions are combined
        // manually here rather than through a dedicated merge call.
        for (String key : oneViewsNgrams.getKeys()) {
            viewNgramsTotal.addSample(key, oneViewsNgrams.getCount(key));
        }
    }
    return viewNgramsTotal;
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class LucenePMetaCollectorBase method process.
/**
 * Writes the n-grams of a two-view document into the collector's fields.
 *
 * <p>Three distributions are computed: one per view and one over both views.
 * Every n-gram is then written once per occurrence, first for the combined
 * distribution, then for view 1, then for view 2.
 *
 * @param jcas the CAS holding the {@code PART_ONE} and {@code PART_TWO} views
 * @throws AnalysisEngineProcessException if a view cannot be obtained or n-gram
 *         extraction raises a {@code TextClassificationException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas firstView;
    JCas secondView;
    try {
        firstView = jcas.getView(PART_ONE);
        secondView = jcas.getView(PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    List<JCas> bothViews = new ArrayList<JCas>();
    bothViews.add(firstView);
    bothViews.add(secondView);

    FrequencyDistribution<String> firstViewFd;
    FrequencyDistribution<String> secondViewFd;
    FrequencyDistribution<String> combinedFd;
    try {
        TextClassificationTarget firstTarget = JCasUtil.selectSingle(firstView, TextClassificationTarget.class);
        TextClassificationTarget secondTarget = JCasUtil.selectSingle(secondView, TextClassificationTarget.class);
        firstViewFd = getNgramsFDView1(firstView, firstTarget);
        secondViewFd = getNgramsFDView2(secondView, secondTarget);
        combinedFd = getNgramsFD(bothViews);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // One field entry per occurrence, so the stored frequency mirrors the count.
    for (String ngram : combinedFd.getKeys()) {
        long occurrences = combinedFd.getCount(ngram);
        for (int written = 0; written < occurrences; written++) {
            addField(getFieldName(), ngram);
        }
    }
    for (String ngram : firstViewFd.getKeys()) {
        long occurrences = firstViewFd.getCount(ngram);
        for (int written = 0; written < occurrences; written++) {
            addField(getFieldNameView1(), ngram);
        }
    }
    for (String ngram : secondViewFd.getKeys()) {
        long occurrences = secondViewFd.getCount(ngram);
        for (int written = 0; written < occurrences; written++) {
            addField(getFieldNameView2(), ngram);
        }
    }
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class TcAnnotator method processUnit.
/**
 * Classifies every unit annotation of type {@code nameUnit} one at a time.
 *
 * <p>For each unit, a temporary {@code TextClassificationTarget} and a dummy
 * {@code TextClassificationOutcome} covering the unit's span are indexed, the
 * engine is run on the CAS, and the outcome value is recorded. Both temporary
 * annotations are removed again before the next unit is handled. After all
 * units are processed, one outcome annotation per unit is added with the
 * recorded value.
 *
 * @param aJCas the CAS containing the unit annotations
 * @throws AnalysisEngineProcessException if the engine fails on any unit
 */
private void processUnit(JCas aJCas) throws AnalysisEngineProcessException {
    Type type = aJCas.getCas().getTypeSystem().getType(nameUnit);
    Collection<AnnotationFS> select = CasUtil.select(aJCas.getCas(), type);
    // Work on a copy of the selection; presumably this keeps the iteration
    // independent of the index changes made inside the loop.
    List<AnnotationFS> unitAnnotation = new ArrayList<AnnotationFS>(select);
    List<String> outcomes = new ArrayList<String>(unitAnnotation.size());
    // iterate the units and set on each a prepared dummy outcome
    for (AnnotationFS unit : unitAnnotation) {
        TextClassificationTarget tcs = new TextClassificationTarget(aJCas, unit.getBegin(), unit.getEnd());
        tcs.addToIndexes();
        TextClassificationOutcome tco = new TextClassificationOutcome(aJCas, unit.getBegin(), unit.getEnd());
        tco.setOutcome(Constants.TC_OUTCOME_DUMMY_VALUE);
        tco.addToIndexes();
        try {
            engine.process(aJCas);
            // store the outcome
            outcomes.add(tco.getOutcome());
        } finally {
            // Always remove the temporary annotations, even when the engine throws,
            // so a failed unit does not leave a stray target/outcome in the CAS.
            tcs.removeFromIndexes();
            tco.removeFromIndexes();
        }
    }
    // iterate again to set for each unit the outcome
    for (int i = 0; i < unitAnnotation.size(); i++) {
        AnnotationFS unit = unitAnnotation.get(i);
        TextClassificationOutcome tco = new TextClassificationOutcome(aJCas, unit.getBegin(), unit.getEnd());
        tco.setOutcome(outcomes.get(i));
        tco.addToIndexes();
    }
}
use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
the class TcAnnotator method processDocument.
/**
 * Classifies the CAS as a single document.
 *
 * <p>If the CAS has no {@code TextClassificationTarget}, one spanning the whole
 * document text is added. If it has no {@code TextClassificationOutcome}, one
 * with an empty outcome string is added. The engine is then run on the CAS.
 *
 * @param aJCas the CAS to classify
 * @throws AnalysisEngineProcessException wrapping any exception thrown by the engine
 */
private void processDocument(JCas aJCas) throws AnalysisEngineProcessException {
    boolean targetPresent = JCasUtil.exists(aJCas, TextClassificationTarget.class);
    if (!targetPresent) {
        // No target yet: let it cover the full document text.
        int documentEnd = aJCas.getDocumentText().length();
        new TextClassificationTarget(aJCas, 0, documentEnd).addToIndexes();
    }

    // An outcome annotation has to exist before the engine runs.
    boolean outcomePresent = JCasUtil.exists(aJCas, TextClassificationOutcome.class);
    if (!outcomePresent) {
        TextClassificationOutcome placeholder = new TextClassificationOutcome(aJCas);
        placeholder.setOutcome("");
        placeholder.addToIndexes();
    }

    // NOTE(review): the original comment says the engine is "loaded from the model";
    // that setup is not visible here.
    try {
        engine.process(aJCas);
    } catch (Exception failure) {
        throw new AnalysisEngineProcessException(failure);
    }
}
Aggregations