use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class BrownClusterFeature method init.
/**
 * Lazily loads the lookup table from the reader returned by {@code openFile()}.
 *
 * <p>Does nothing when {@code map} has already been populated. Every line is split on a
 * tab character; the second column is used as the key and the first column as the value.
 *
 * @throws TextClassificationException
 *             if the file cannot be opened or read, or if a line has fewer than two
 *             tab-separated columns
 */
private void init() throws TextClassificationException {
    if (map != null) {
        return;
    }
    // Fill a local table and publish it only after the whole file was read successfully.
    // Otherwise a failure half-way through would leave a non-null, partially filled map
    // behind and every later call would return early with incomplete data.
    // Declared as HashMap so the assignment below works whether the field is typed as
    // Map or as HashMap.
    HashMap<String, String> loaded = new HashMap<String, String>();
    // try-with-resources guarantees the reader is closed on success and on failure.
    try (BufferedReader reader = openFile()) {
        String line;
        while ((line = reader.readLine()) != null) {
            String[] columns = line.split("\t");
            loaded.put(columns[1], columns[0]);
        }
    } catch (Exception e) {
        throw new TextClassificationException(e);
    }
    map = loaded;
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class SimilarityPairFeatureExtractor method extract.
/**
 * Computes a single numeric similarity feature for the two views.
 *
 * <p>Depending on the mode reported by the similarity resource, the measure is applied to
 * the raw document texts, to the two JCas views directly, or to the item lists obtained
 * via {@code getItems} after {@code null} and {@code "_"} entries have been removed.
 *
 * @param view1
 *            the first view
 * @param view2
 *            the second view
 * @return a one-element set holding the feature named {@code "Similarity" + <measure name>}
 * @throws TextClassificationException
 *             if item extraction or the similarity computation fails
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        double score;
        switch (textSimilarityResource.getMode()) {
        case text: {
            String leftText = view1.getDocumentText();
            String rightText = view2.getDocumentText();
            score = textSimilarityResource.getSimilarity(leftText, rightText);
            break;
        }
        case jcas: {
            JCasTextSimilarityMeasure jcasMeasure = (JCasTextSimilarityMeasure) textSimilarityResource;
            score = jcasMeasure.getSimilarity(view1, view2);
            break;
        }
        default: {
            List<String> leftItems = getItems(view1);
            List<String> rightItems = getItems(view2);
            removePlaceholderTokens(leftItems);
            removePlaceholderTokens(rightItems);
            score = textSimilarityResource.getSimilarity(leftItems, rightItems);
        }
        }
        String featureName = "Similarity" + textSimilarityResource.getName();
        return new Feature(featureName, score, FeatureType.NUMERIC).asSet();
    } catch (FeaturePathException | SimilarityException e) {
        throw new TextClassificationException(e);
    }
}

/**
 * Removes every {@code null} entry and every {@code "_"} entry from the list in place.
 * Walks from the back so that removals do not shift the indices still to be visited.
 */
private void removePlaceholderTokens(List<String> items) {
    int idx = items.size();
    while (--idx >= 0) {
        String item = items.get(idx);
        if (item == null || "_".equals(item)) {
            items.remove(idx);
        }
    }
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class CosineFeatureExtractor method extract.
/**
 * Computes a single numeric similarity feature from the n-gram key sets of the two views.
 *
 * <p>For each view the single {@code TextClassificationTarget} is selected and its
 * n-grams (min and max length 1) are collected; the configured measure is then applied
 * to the two key sets. A {@code NaN} result is reported as {@code 0.0}.
 *
 * @param view1
 *            the first view
 * @param view2
 *            the second view
 * @return a one-element set holding the feature named {@code "Similarity" + <measure name>}
 * @throws TextClassificationException
 *             if the similarity computation fails
 */
@Override
public Set<Feature> extract(JCas view1, JCas view2) throws TextClassificationException {
    try {
        TextClassificationTarget firstTarget = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget secondTarget = JCasUtil.selectSingle(view2, TextClassificationTarget.class);

        // Note: getSimilarity(String, String) is *not* a convenience
        // method for getSimilarity(Collection<String>, Collection<String>),
        // so the collection variant is called explicitly with the key sets.
        Set<String> firstKeys = NGramUtils
                .getDocumentNgrams(view1, firstTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();
        Set<String> secondKeys = NGramUtils
                .getDocumentNgrams(view2, secondTarget, true, false, 1, 1, stopwords, ngramAnnotationType)
                .getKeys();

        double rawScore = measure.getSimilarity(firstKeys, secondKeys);
        // Temporary fix for DKPro Similarity Issue 30: map NaN to 0.0
        double score = Double.isNaN(rawScore) ? 0.0 : rawScore;

        String featureName = "Similarity" + measure.getName();
        return new Feature(featureName, score, FeatureType.NUMERIC).asSet();
    } catch (SimilarityException e) {
        throw new TextClassificationException(e);
    }
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class PhoneticNGramMC method getDocumentPhoneticNgrams.
/**
 * Collects the phonetic n-grams of all sentences covered by the target annotation.
 *
 * <p>Each token's covered text is encoded phonetically ({@code Soundex} for document
 * language {@code "en"}, {@code ColognePhonetic} for {@code "de"}). N-grams of length
 * {@code minN} to {@code maxN} are then built per sentence from the encoded tokens,
 * joined with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas
 *            the JCas to read the document language, sentences and tokens from
 * @param target
 *            the annotation whose covered sentences are processed
 * @param minN
 *            the minimum n-gram length
 * @param maxN
 *            the maximum n-gram length
 * @return the frequency distribution of the joined phonetic n-grams
 * @throws TextClassificationException
 *             if the document language is missing or unsupported, or if a token cannot
 *             be encoded
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    StringEncoder encoder;
    String languageCode = jcas.getDocumentLanguage();
    // Constant-first comparison: a missing (null) language falls through to the
    // descriptive "not supported" error below instead of a bare NullPointerException.
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }
    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, target)) {
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
        }
        // N-grams are built per sentence, so they never span a sentence boundary.
        String[] array = phoneticStrings.toArray(new String[phoneticStrings.size()]);
        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}
use of org.dkpro.tc.api.exception.TextClassificationException in project dkpro-tc by dkpro.
the class LuceneCPMetaCollectorBase method process.
/**
 * Collects n-grams from both parts of a pair document and stores them as fields.
 *
 * <p>Four groups of fields are written via {@code addField}: the n-grams over both views
 * together, the n-grams of view 1, the n-grams of view 2, and the combinations of one
 * view-1 n-gram with one view-2 n-gram whose combined length lies within
 * {@code getNgramMinNCombo()} and {@code getNgramMaxNCombo()}. Each value is added once
 * per counted occurrence.
 *
 * @param jcas
 *            the JCas holding the {@code Constants.PART_ONE} and
 *            {@code Constants.PART_TWO} views
 * @throws AnalysisEngineProcessException
 *             if a view cannot be obtained or n-gram extraction fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas view1;
    JCas view2;
    try {
        view1 = jcas.getView(Constants.PART_ONE);
        view2 = jcas.getView(Constants.PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    List<JCas> jcases = new ArrayList<JCas>();
    jcases.add(view1);
    jcases.add(view2);
    FrequencyDistribution<String> view1NGrams;
    FrequencyDistribution<String> view2NGrams;
    FrequencyDistribution<String> documentNGrams;
    try {
        TextClassificationTarget aTarget1 = JCasUtil.selectSingle(view1, TextClassificationTarget.class);
        TextClassificationTarget aTarget2 = JCasUtil.selectSingle(view2, TextClassificationTarget.class);
        view1NGrams = getNgramsFDView1(view1, aTarget1);
        view2NGrams = getNgramsFDView2(view2, aTarget2);
        documentNGrams = getNgramsFD(jcases);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }
    // The counts are looked up once per n-gram instead of on every loop iteration,
    // and the counters are long so they cannot overflow against a long bound.
    for (String ngram : documentNGrams.getKeys()) {
        long occurrences = documentNGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldName(), ngram);
        }
    }
    for (String ngram : view1NGrams.getKeys()) {
        long occurrences = view1NGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldNameView1(), ngram);
        }
    }
    for (String ngram : view2NGrams.getKeys()) {
        long occurrences = view2NGrams.getCount(ngram);
        for (long i = 0; i < occurrences; i++) {
            addField(getFieldNameView2(), ngram);
        }
    }
    for (String ngram1 : view1NGrams.getKeys()) {
        // Length and count of the view-1 n-gram do not depend on the inner loop.
        int size1 = ngram1.split(NGRAM_GLUE).length;
        long count1 = view1NGrams.getCount(ngram1);
        for (String ngram2 : view2NGrams.getKeys()) {
            int combinedSize = size1 + ngram2.split(NGRAM_GLUE).length;
            if (combinedSize <= getNgramMaxNCombo() && combinedSize >= getNgramMinNCombo()) {
                // The combination is added once per pairing of occurrences, i.e.
                // count1 * count2 times.
                // NOTE(review): an earlier comment here said "set count = 1, for doc freq
                // and not total term freq", which does not match this multiplication -
                // confirm which behavior is intended.
                long count = count1 * view2NGrams.getCount(ngram2);
                for (long i = 0; i < count; i++) {
                    addField(getFieldNameCombo(), ngram1 + ComboUtils.JOINT + ngram2);
                }
            }
        }
    }
}
Aggregations