Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class BrownCorpusReader, method getNext.
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
    super.getNext(cas);

    JCas aJCas;
    try {
        aJCas = cas.getJCas();
    } catch (CASException e) {
        throw new CollectionException(e);
    }

    // One classification sequence per sentence; one target + outcome per token.
    for (Sentence s : JCasUtil.select(aJCas, Sentence.class)) {
        TextClassificationSequence seq =
                new TextClassificationSequence(aJCas, s.getBegin(), s.getEnd());
        seq.addToIndexes();

        for (Token t : JCasUtil.selectCovered(aJCas, Token.class, s)) {
            TextClassificationTarget target =
                    new TextClassificationTarget(aJCas, t.getBegin(), t.getEnd());
            // The token text is appended as a suffix to this unit's id.
            target.setSuffix(t.getCoveredText());
            target.addToIndexes();

            TextClassificationOutcome outcome =
                    new TextClassificationOutcome(aJCas, t.getBegin(), t.getEnd());
            outcome.setOutcome(getTextClassificationOutcome(aJCas, target));
            outcome.addToIndexes();
        }
    }
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class SimilarityPairFeatureTest, method similarityPairFeatureTest.
@Test
public void similarityPairFeatureTest() throws Exception {
    // Greedy String Tiling similarity resource with a minimum match length of 3.
    ExternalResourceDescription gstResource = ExternalResourceFactory.createExternalResourceDescription(
            GreedyStringTilingMeasureResource.class,
            GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3");

    AnalysisEngine engine = createEngine(createEngineDescription(NoOpAnnotator.class));
    JCas jcas = engine.newJCas();

    // Two paired views holding near-anagram sentences.
    TokenBuilder<Token, Sentence> tokenBuilder =
            new TokenBuilder<Token, Sentence>(Token.class, Sentence.class);

    JCas firstView = jcas.createView(VIEW1);
    firstView.setDocumentLanguage("en");
    tokenBuilder.buildTokens(firstView, "This is a test .");

    JCas secondView = jcas.createView(VIEW2);
    secondView.setDocumentLanguage("en");
    tokenBuilder.buildTokens(secondView, "Test is this .");

    engine.process(jcas);

    SimilarityPairFeatureExtractor extractor = FeatureUtil.createResource(
            SimilarityPairFeatureExtractor.class,
            SimilarityPairFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            SimilarityPairFeatureExtractor.PARAM_SEGMENT_FEATURE_PATH, Token.class.getName(),
            SimilarityPairFeatureExtractor.PARAM_TEXT_SIMILARITY_RESOURCE, gstResource);

    Set<Feature> features = extractor.extract(jcas.getView(VIEW1), jcas.getView(VIEW2));
    Assert.assertEquals(1, features.size());

    Iterator<Feature> iter = features.iterator();
    assertFeature("SimilarityGreedyStringTiling_3", 0.8125, iter.next(), 0.0001);
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class KeywordNGramUtils, method getDocumentKeywordNgrams.
// all tokens should be already lowercased
/**
 * Finds all minN- to maxN-length ngrams of tokens occurring in the keyword list. All tokens
 * should already be lowercased, if applicable. The keyword list can contain multi-token words
 * like "Brussel sprouts". If keyword list contains both "Brussel" and "Brussel sprouts", then
 * only "Brussel sprouts" will be added. Otherwise, the smallest multiword matching keyword will
 * be added.
 *
 * @param jcas
 * a jcas
 * @param anno
 * the annotation
 * @param minN
 * minimum ngram length
 * @param maxN
 * maximum ngram length
 * @param markSentenceBoundary
 * mark the boundary of a sentence
 * @param markSentenceLocation
 * mark the location of a sentence
 * @param includeCommas
 * include commas
 * @param keywords
 * list of keywords
 * @return all ngrams of keywords in jcas
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
// Flat stream of matched keywords (plus optional sentence markers) from which ngrams are built.
List<String> keywordList = new ArrayList<String>();
int sentenceNumber = 0;
int totalSentences = selectCovered(jcas, Sentence.class, anno).size();
for (Sentence s : selectCovered(jcas, Sentence.class, anno)) {
List<Token> sentence = selectCovered(Token.class, s);
for (int tokenpointer = 0; tokenpointer < sentence.size(); tokenpointer++) {
String token = sentence.get(tokenpointer).getCoveredText();
token = token.toLowerCase();
String compositeNgram = "";
boolean foundComposite = false;
// Grow a candidate multiword backwards from the current token ("sprouts" -> "Brussel sprouts")
// and record every candidate found in the keyword set.
for (int i = tokenpointer; i >= 0; i--) {
compositeNgram = sentence.get(i).getCoveredText().toLowerCase() + " " + compositeNgram;
// Only the first iteration leaves a trailing space (empty seed); strip it so the
// single-token candidate matches keywords exactly.
if (compositeNgram.endsWith(" ")) {
compositeNgram = compositeNgram.replace(" ", "");
}
if (keywords.contains(compositeNgram)) {
// Internal spaces are glued so the multiword behaves as one ngram unit later.
keywordList.add(compositeNgram.replace(" ", MIDNGRAMGLUE));
foundComposite = true;
}
}
// Single-token fallback only when no composite ending at this token matched.
if (!foundComposite && keywords.contains(token)) {
keywordList.add(token);
} else if (includeCommas && token.equals(",")) {
keywordList.add(COMMA);
}
}
String sentenceBoundary = SENTENCE_BOUNDARY;
// Optionally tag the boundary with the sentence's rough position in the document
// (first quarter = BEG, last quarter = END, otherwise MID).
if (markSentenceLocation) {
if (((double) sentenceNumber / totalSentences) < 0.25) {
sentenceBoundary = sentenceBoundary + "BEG";
} else if (((double) sentenceNumber / totalSentences) > 0.75) {
sentenceBoundary = sentenceBoundary + "END";
} else {
sentenceBoundary = sentenceBoundary + "MID";
}
}
if (markSentenceBoundary) {
keywordList.add(sentenceBoundary);
}
sentenceNumber++;
}
// Slide minN..maxN windows over the collected keyword stream and count each ngram.
for (List<String> ngram : new NGramStringListIterable(keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
String ngramString = StringUtils.join(ngram, GLUE);
documentNgrams.inc(ngramString);
}
return documentNgrams;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class SentenceRatioPerDocument, method extract.
@Override
public Set<Feature> extract(JCas jcas, TextClassificationTarget aTarget) throws TextClassificationException {
    // Count sentences covered by the target and normalize by the configured maximum.
    List<Sentence> covered = JCasUtil.selectCovered(jcas, Sentence.class, aTarget);
    double ratio = getRatio(covered.size(), getMax());
    return new Feature(FEATURE_NAME, ratio, FeatureType.NUMERIC).asSet();
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-tc by dkpro:
class PhoneticNGramMC, method getDocumentPhoneticNgrams.
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    // Choose a phonetic encoder based on the document language; only English and German are supported.
    String languageCode = jcas.getDocumentLanguage();
    final StringEncoder encoder;
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, target)) {
        // Phonetically encode each token of the sentence.
        List<String> encoded = new ArrayList<String>();
        for (Token token : selectCovered(jcas, Token.class, sentence)) {
            try {
                encoded.add(encoder.encode(token.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
        }
        // Count all minN..maxN ngrams over the encoded token sequence.
        String[] encodedArray = encoded.toArray(new String[encoded.size()]);
        for (List<String> ngram : new NGramStringListIterable(encodedArray, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}
Aggregations