Use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
Class SimilarityPairFeatureExtractor, method extract.
/**
 * Computes a single numeric similarity feature for a pair of views.
 *
 * <p>The comparison strategy follows the mode reported by the configured text similarity
 * resource: {@code text} compares the raw document texts, {@code jcas} hands both views to
 * the resource cast to {@code JCasTextSimilarityMeasure}, and every other mode compares the
 * item lists returned by {@code getItems} after {@code null} and {@code "_"} entries have
 * been removed from them.
 *
 * @param view1 first view of the pair
 * @param view2 second view of the pair
 * @return a one-element set holding the feature named
 *         {@code "Similarity" + textSimilarityResource.getName()}
 * @throws TextClassificationException wrapping any {@code FeaturePathException} or
 *         {@code SimilarityException} raised while computing the score
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        double score;
        switch (textSimilarityResource.getMode()) {
            case text: {
                String leftText = view1.getDocumentText();
                String rightText = view2.getDocumentText();
                score = textSimilarityResource.getSimilarity(leftText, rightText);
                break;
            }
            case jcas: {
                JCasTextSimilarityMeasure jcasMeasure = (JCasTextSimilarityMeasure) textSimilarityResource;
                score = jcasMeasure.getSimilarity(view1, view2);
                break;
            }
            default: {
                List<String> leftItems = getItems(view1);
                List<String> rightItems = getItems(view2);

                // Drop null and "_" entries; walking backwards keeps the
                // remaining indices valid while elements are removed in place.
                int pos = leftItems.size();
                while (--pos >= 0) {
                    String item = leftItems.get(pos);
                    if (item == null || "_".equals(item)) {
                        leftItems.remove(pos);
                    }
                }
                pos = rightItems.size();
                while (--pos >= 0) {
                    String item = rightItems.get(pos);
                    if (item == null || "_".equals(item)) {
                        rightItems.remove(pos);
                    }
                }

                score = textSimilarityResource.getSimilarity(leftItems, rightItems);
            }
        }

        String featureName = "Similarity" + textSimilarityResource.getName();
        Feature similarityFeature = new Feature(featureName, score, FeatureType.NUMERIC);
        return similarityFeature.asSet();
    } catch (FeaturePathException featurePathProblem) {
        throw new TextClassificationException(featurePathProblem);
    } catch (SimilarityException similarityProblem) {
        throw new TextClassificationException(similarityProblem);
    }
}
Use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
Class SimilarityPairFeatureTest, method similarityPairFeatureTest.
/**
 * Runs {@code SimilarityPairFeatureExtractor} with a greedy-string-tiling resource
 * (minimum match length "3") over two small tokenized views and checks that exactly one
 * feature with the expected name and value is produced.
 */
@Test
public void similarityPairFeatureTest() throws Exception {
    // Build an empty CAS from a no-op engine and fill two views with tokens.
    AnalysisEngine noOpEngine = createEngine(createEngineDescription(NoOpAnnotator.class));
    JCas baseCas = noOpEngine.newJCas();
    TokenBuilder<Token, Sentence> tokenBuilder =
            new TokenBuilder<Token, Sentence>(Token.class, Sentence.class);

    JCas firstView = baseCas.createView(VIEW1);
    firstView.setDocumentLanguage("en");
    tokenBuilder.buildTokens(firstView, "This is a test .");

    JCas secondView = baseCas.createView(VIEW2);
    secondView.setDocumentLanguage("en");
    tokenBuilder.buildTokens(secondView, "Test is this .");

    noOpEngine.process(baseCas);

    // Similarity measure shared with the extractor as an external resource.
    ExternalResourceDescription tilingResource =
            ExternalResourceFactory.createExternalResourceDescription(
                    GreedyStringTilingMeasureResource.class,
                    GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3");

    SimilarityPairFeatureExtractor pairExtractor = FeatureUtil.createResource(
            SimilarityPairFeatureExtractor.class,
            SimilarityPairFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            SimilarityPairFeatureExtractor.PARAM_SEGMENT_FEATURE_PATH, Token.class.getName(),
            SimilarityPairFeatureExtractor.PARAM_TEXT_SIMILARITY_RESOURCE, tilingResource);

    Set<Feature> extracted =
            pairExtractor.extract(baseCas.getView(VIEW1), baseCas.getView(VIEW2));

    Assert.assertEquals(1, extracted.size());
    assertFeature("SimilarityGreedyStringTiling_3", 0.8125, extracted.iterator().next(), 0.0001);
}
Use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
Class CosineFeatureExtractor, method extract.
/**
 * Computes one numeric similarity feature from the n-gram key sets of two views.
 *
 * <p>Each view must contain exactly one {@code TextClassificationTarget}; the n-grams are
 * gathered with {@code NGramUtils.getDocumentNgrams} using the configured stopwords and
 * n-gram annotation type, and the resulting key sets are passed to {@code measure}.
 * A {@code NaN} score is reported as {@code 0.0}.
 *
 * @param view1 first view of the pair
 * @param view2 second view of the pair
 * @return a one-element set holding the feature named
 *         {@code "Similarity" + measure.getName()}
 * @throws TextClassificationException wrapping any {@code SimilarityException} raised by
 *         the measure
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        TextClassificationTarget firstTarget =
                JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget secondTarget =
                JCasUtil.selectSingle(view2, TextClassificationTarget.class);

        // The collection-based getSimilarity is used on purpose:
        // getSimilarity(String, String) is *not* a convenience wrapper
        // around getSimilarity(Collection<String>, Collection<String>).
        Set<String> firstNgrams = NGramUtils
                .getDocumentNgrams(view1, firstTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();
        Set<String> secondNgrams = NGramUtils
                .getDocumentNgrams(view2, secondTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();

        double rawScore = measure.getSimilarity(firstNgrams, secondNgrams);

        // Temporary fix for DKPro Similarity Issue 30: map NaN to zero.
        double score = Double.isNaN(rawScore) ? 0.0 : rawScore;

        String featureName = "Similarity" + measure.getName();
        return new Feature(featureName, score, FeatureType.NUMERIC).asSet();
    } catch (SimilarityException similarityProblem) {
        throw new TextClassificationException(similarityProblem);
    }
}
Use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
Class PhoneticNGram, method extract.
/**
 * Emits one boolean feature per key of {@code topKSet}, named
 * {@code getFeaturePrefix() + "_" + ngram}.
 *
 * <p>The value is {@code 1} when the n-gram is among the phonetic n-grams collected for the
 * target and {@code 0} otherwise. The absent case uses the four-argument {@code Feature}
 * constructor with {@code true} (presumably marking a default value; confirm against
 * {@code Feature}).
 *
 * @param jcas    the CAS to extract from
 * @param aTarget the classification target inside {@code jcas}
 * @return the set of generated features
 * @throws TextClassificationException declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    FrequencyDistribution<String> targetNgrams =
            PhoneticNGramMC.getDocumentPhoneticNgrams(jcas, aTarget, ngramMinN, ngramMaxN);

    Set<Feature> result = new HashSet<Feature>();
    for (String candidate : topKSet.getKeys()) {
        boolean present = targetNgrams.getKeys().contains(candidate);
        String featureName = getFeaturePrefix() + "_" + candidate;
        Feature feature = present
                ? new Feature(featureName, 1, FeatureType.BOOLEAN)
                : new Feature(featureName, 0, true, FeatureType.BOOLEAN);
        result.add(feature);
    }
    return result;
}
Use of org.dkpro.tc.api.features.Feature in project dkpro-tc by dkpro.
Class CosineSimilarityTest, method testCosSimWithLemmas.
/**
 * Runs the cosine-similarity pipeline with lemmas as the n-gram annotation type and checks
 * that exactly one feature, named {@code SimilarityCosineSimilarity}, is produced with the
 * value {@code 0.2} (within {@code epsilon}).
 */
@Test
public void testCosSimWithLemmas() throws Exception {
    CosineSimilarityTest test = new CosineSimilarityTest();
    test.initialize();
    test.parameters = new Object[] {
            CosineFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            CosineFeatureExtractor.PARAM_SOURCE_LOCATION, test.lucenePath.toString(),
            IdfPairMetaCollector.PARAM_TARGET_LOCATION, test.lucenePath.toString(),
            CosineFeatureExtractor.PARAM_NGRAM_ANNO_TYPE,
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" };
    test.runPipeline();

    // Check the count before touching first(): an empty result then fails with a readable
    // assertion message (presumably first() would throw on an empty set instead).
    // Arguments are given as (expected, actual) so failure messages are not reversed.
    assertEquals(1, test.featureNames.size());
    // assertEquals reports the actual name on failure, unlike assertTrue(a.equals(b)).
    assertEquals("SimilarityCosineSimilarity", test.featureNames.first());

    for (Feature feat : test.instanceList.get(0).getFeatures()) {
        assertEquals(0.2, (double) feat.getValue(), epsilon);
    }
}
Aggregations