Use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
From the class LuceneKeywordPFE, method extract.
/**
 * Builds the keyword n-gram feature set for a pair of views.
 *
 * <p>The single {@link TextClassificationTarget} of each view is looked up via
 * {@code JCasUtil.selectSingle}, keyword n-grams are collected per view and
 * across both views, and the configured subsets are turned into features by
 * {@code addToFeatureArray}.
 *
 * <p>The {@code prefix} field is reassigned before every call to
 * {@code addToFeatureArray}; presumably that helper reads it to name the
 * generated features (its implementation is not visible here - confirm).
 *
 * @param view1 the first view of the pair
 * @param view2 the second view of the pair
 * @return the collected features; empty if no feature group is enabled
 * @throws TextClassificationException declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    final TextClassificationTarget targetOfView1 =
            JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    final TextClassificationTarget targetOfView2 =
            JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    // Per-view keyword n-grams, each with its own min/max n-gram size.
    final FrequencyDistribution<String> keywordNgramsView1 =
            KeywordNGramUtils.getDocumentKeywordNgrams(view1, targetOfView1, ngramMinN1,
                    ngramMaxN1, markSentenceBoundary, markSentenceLocation, includeCommas,
                    keywords);
    final FrequencyDistribution<String> keywordNgramsView2 =
            KeywordNGramUtils.getDocumentKeywordNgrams(view2, targetOfView2, ngramMinN2,
                    ngramMaxN2, markSentenceBoundary, markSentenceLocation, includeCommas,
                    keywords);
    // N-grams gathered over both views together.
    final FrequencyDistribution<String> viewBlindNgrams = getViewNgrams(view1, view2);

    Set<Feature> collected = new HashSet<Feature>();

    if (useView1NgramsAsFeatures) {
        prefix = "keyNG1";
        collected = addToFeatureArray(keywordNgramsView1, topKSetView1, collected);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "keyNG2";
        collected = addToFeatureArray(keywordNgramsView2, topKSetView2, collected);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (markViewBlindNgramsWithLocalView) {
            // Shared top-k set, but features are still labelled per originating view.
            prefix = "keyNGall1";
            collected = addToFeatureArray(keywordNgramsView1, topKSet, collected);
            prefix = "keyNGall2";
            collected = addToFeatureArray(keywordNgramsView2, topKSet, collected);
        } else {
            prefix = "keyNG";
            collected = addToFeatureArray(viewBlindNgrams, topKSet, collected);
        }
    }

    return collected;
}
Use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
From the class LuceneNGramPFE, method extract.
/**
 * Builds the token n-gram feature set for a pair of views.
 *
 * <p>N-grams are collected for each view from its single
 * {@link TextClassificationTarget} and, via {@code getViewNgrams}, over both
 * views together. Depending on the configuration flags, the distributions are
 * passed to {@code addToFeatureArray} together with the matching top-k set.
 *
 * <p>The {@code prefix} field is reassigned before every call to
 * {@code addToFeatureArray}; presumably that helper reads it to name the
 * generated features (its implementation is not visible here - confirm).
 *
 * @param view1 the first view of the pair
 * @param view2 the second view of the pair
 * @return the collected features; empty if no feature group is enabled
 * @throws TextClassificationException declared by the overridden method
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    final TextClassificationTarget targetOfView1 =
            JCasUtil.selectSingle(view1, TextClassificationTarget.class);
    final TextClassificationTarget targetOfView2 =
            JCasUtil.selectSingle(view2, TextClassificationTarget.class);

    // Per-view n-grams, each with its own min/max n-gram size.
    final FrequencyDistribution<String> ngramsOfView1 = NGramUtils.getDocumentNgrams(view1,
            targetOfView1, ngramLowerCase, filterPartialStopwordMatches, ngramMinN1, ngramMaxN1,
            stopwords, Token.class);
    final FrequencyDistribution<String> ngramsOfView2 = NGramUtils.getDocumentNgrams(view2,
            targetOfView2, ngramLowerCase, filterPartialStopwordMatches, ngramMinN2, ngramMaxN2,
            stopwords, Token.class);
    // N-grams gathered over both views together.
    final FrequencyDistribution<String> viewBlindNgrams = getViewNgrams(view1, view2);

    Set<Feature> collected = new HashSet<Feature>();

    if (useView1NgramsAsFeatures) {
        prefix = "view1NG";
        collected = addToFeatureArray(ngramsOfView1, topKSetView1, collected);
    }

    if (useView2NgramsAsFeatures) {
        prefix = "view2NG";
        collected = addToFeatureArray(ngramsOfView2, topKSetView2, collected);
    }

    if (useViewBlindNgramsAsFeatures) {
        if (markViewBlindNgramsWithLocalView) {
            // Shared top-k set, but features are still labelled per originating view.
            prefix = "view1allNG";
            collected = addToFeatureArray(ngramsOfView1, topKSet, collected);
            prefix = "view2allNG";
            collected = addToFeatureArray(ngramsOfView2, topKSet, collected);
        } else {
            prefix = "allNG";
            collected = addToFeatureArray(viewBlindNgrams, topKSet, collected);
        }
    }

    return collected;
}
Use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
From the class TestReaderSentenceToDocument, method getNext.
/**
 * Populates the given CAS with the text stored at the current {@code offset}
 * and then advances {@code offset} by one.
 *
 * <p>Besides text and language, the CAS receives document meta data derived
 * from the offset, a {@code JCasId}, a {@code TextClassificationOutcome} and a
 * {@link TextClassificationTarget} spanning the whole document text.
 *
 * @param aJCas the CAS to fill
 * @throws IOException declared by the overridden method
 * @throws CollectionException declared by the overridden method
 */
@Override
public void getNext(JCas aJCas) throws IOException, CollectionException {
    // Document text and language.
    final String sentenceText = texts.get(offset);
    aJCas.setDocumentText(sentenceText);
    aJCas.setDocumentLanguage(LANGUAGE_CODE);

    // Several CASes are created from a single file, so each one needs its own
    // title and URI; otherwise serialized CASes would overwrite each other.
    final String sentenceName = "Sentence" + offset;
    final DocumentMetaData metaData = DocumentMetaData.create(aJCas);
    metaData.setDocumentTitle(sentenceName);
    metaData.setDocumentUri(sentenceName);
    metaData.setDocumentId(String.valueOf(offset));

    // NOTE(review): jcasId is not advanced in this method - confirm it is
    // updated elsewhere if every CAS is expected to carry a distinct id.
    final JCasId casId = new JCasId(aJCas);
    casId.setId(jcasId);
    casId.addToIndexes();

    // Outcome / label for this document.
    final TextClassificationOutcome label = new TextClassificationOutcome(aJCas);
    label.setOutcome(getTextClassificationOutcome(aJCas));
    label.addToIndexes();

    // The classification target covers the complete document text.
    final int textEnd = aJCas.getDocumentText().length();
    final TextClassificationTarget wholeDocument = new TextClassificationTarget(aJCas, 0, textEnd);
    wholeDocument.addToIndexes();

    offset++;
}
Use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
From the class LuceneCPMetaCollectorBase, method process.
/**
 * Collects n-grams from the two views of a pair and writes them as fields.
 *
 * <p>The views {@code Constants.PART_ONE} and {@code Constants.PART_TWO} are
 * fetched from {@code jcas}. Four groups of fields are then written through
 * {@code addField}:
 * <ul>
 * <li>n-grams over both views, under {@code getFieldName()},</li>
 * <li>n-grams of view 1, under {@code getFieldNameView1()},</li>
 * <li>n-grams of view 2, under {@code getFieldNameView2()},</li>
 * <li>every view-1 / view-2 n-gram pair whose combined token count lies in
 * {@code [getNgramMinNCombo(), getNgramMaxNCombo()]}, joined with
 * {@code ComboUtils.JOINT}, under {@code getFieldNameCombo()}.</li>
 * </ul>
 * Each value is written once per occurrence, i.e. as often as its count in the
 * corresponding frequency distribution (for pairs: the product of both counts).
 *
 * @param jcas the CAS holding both views
 * @throws AnalysisEngineProcessException if a view cannot be obtained or the
 *         n-gram extraction fails with a {@code TextClassificationException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas view1;
    JCas view2;
    try {
        view1 = jcas.getView(Constants.PART_ONE);
        view2 = jcas.getView(Constants.PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    List<JCas> jcases = new ArrayList<JCas>();
    jcases.add(view1);
    jcases.add(view2);

    FrequencyDistribution<String> view1NGrams;
    FrequencyDistribution<String> view2NGrams;
    FrequencyDistribution<String> documentNGrams;
    try {
        TextClassificationTarget aTarget1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget aTarget2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);
        view1NGrams = getNgramsFDView1(view1, aTarget1);
        view2NGrams = getNgramsFDView2(view2, aTarget2);
        documentNGrams = getNgramsFD(jcases);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // The repetition counts below are read once per n-gram instead of on every
    // iteration, and the loop counters are long so they cannot wrap around
    // before reaching a long-valued bound.
    for (String ngram : documentNGrams.getKeys()) {
        final long occurrences = documentNGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldName(), ngram);
        }
    }

    for (String ngram : view1NGrams.getKeys()) {
        final long occurrences = view1NGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldNameView1(), ngram);
        }
    }

    for (String ngram : view2NGrams.getKeys()) {
        final long occurrences = view2NGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldNameView2(), ngram);
        }
    }

    for (String ngram1 : view1NGrams.getKeys()) {
        // Token count of ngram1 does not depend on ngram2, so compute it once.
        final int ngram1Size = ngram1.split(NGRAM_GLUE).length;
        for (String ngram2 : view2NGrams.getKeys()) {
            int combinedSize = ngram1Size + ngram2.split(NGRAM_GLUE).length;
            if (combinedSize <= getNgramMaxNCombo() && combinedSize >= getNgramMinNCombo()) {
                // The pair is written once per co-occurrence (product of both
                // counts).
                // NOTE(review): the previous comment here read "set count = 1,
                // for doc freq and not total term freq", which does not match
                // this computation - confirm which behaviour is intended.
                long count = view1NGrams.getCount(ngram1) * view2NGrams.getCount(ngram2);
                for (long i = 0; i < count; i++) {
                    addField(getFieldNameCombo(), ngram1 + ComboUtils.JOINT + ngram2);
                }
            }
        }
    }
}
Use of org.dkpro.tc.api.type.TextClassificationTarget in project dkpro-tc by dkpro.
From the class NGramUtilsTest, method phoneticNgramsTest.
/**
 * Checks the phonetic n-grams (sizes 1 to 3) extracted from a five-token
 * English sentence that is covered by a single classification target.
 *
 * @throws Exception if the CAS cannot be created or the extraction fails
 */
@Test
public void phoneticNgramsTest() throws Exception {
    final String documentText = "This is a big house";

    final JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText(documentText);

    // The classification target spans the complete text.
    final TextClassificationTarget wholeText =
            new TextClassificationTarget(jcas, 0, documentText.length());
    wholeText.addToIndexes();

    // One Token per whitespace-separated word, then a Sentence added from position 0.
    final JCasBuilder builder = new JCasBuilder(jcas);
    final String[] words = documentText.split(" ");
    for (String word : words) {
        builder.add(word, Token.class);
    }
    builder.add(0, Sentence.class);

    final FrequencyDistribution<String> phoneticNgrams =
            PhoneticNGramMC.getDocumentPhoneticNgrams(jcas, wholeText, 1, 3);

    // Presumably 5 unigrams + 4 bigrams + 3 trigrams - confirm against the extractor.
    assertEquals(12, phoneticNgrams.getN());
    // The expected keys look like phonetic codes of single tokens - confirm.
    assertTrue(phoneticNgrams.contains("I000"));
    assertTrue(phoneticNgrams.contains("T200"));
}
Aggregations