use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
In the class WebAnnoTsv3WriterTestBase, the method makeJCasOneSentence.
/**
 * Builds a JCas containing the fixed text "This is a test ." with {@link Token} and
 * {@link Sentence} annotations produced by a {@link TokenBuilder}.
 *
 * @return the populated JCas
 * @throws UIMAException if the underlying JCas cannot be created
 */
private static JCas makeJCasOneSentence() throws UIMAException {
    JCas jcas = makeJCas();
    TokenBuilder<Token, Sentence> builder = new TokenBuilder<>(Token.class, Sentence.class);
    builder.buildTokens(jcas, "This is a test .");
    return jcas;
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
In the class WebAnnoTsv3WriterTestBase, the method testZeroLengthSpanBetweenAdjacentTokens.
@Test
/**
 * Verifies that a zero-width span annotation sitting exactly on the boundary between two
 * adjacent tokens survives TSV3 serialization.
 */
@Test
public void testZeroLengthSpanBetweenAdjacentTokens() throws Exception {
    JCas jcas = makeJCas();
    jcas.setDocumentText("word.");

    // Two adjacent tokens ("word" and ".") inside a single sentence covering the text.
    new Token(jcas, 0, 4).addToIndexes();
    new Token(jcas, 4, 5).addToIndexes();
    new Sentence(jcas, 0, 5).addToIndexes();

    CAS cas = jcas.getCas();
    Type spanType = cas.getTypeSystem().getType("webanno.custom.SimpleSpan");

    // Zero-width annotation on the token boundary (i.e. at the end of the first token).
    AnnotationFS zeroWidthSpan = cas.createAnnotation(spanType, 4, 4);
    cas.addFsToIndexes(zeroWidthSpan);

    writeAndAssertEquals(jcas, WebannoTsv3Writer.PARAM_SPAN_LAYERS,
            asList("webanno.custom.SimpleSpan"));
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
In the class Tsv3XSerializerTest, the method makeJCasOneSentence.
/**
 * Builds a JCas for the given text using the merged global and test-local type systems,
 * tokenizes it, and replaces all tokenizer-generated sentences with one single sentence
 * spanning the entire document text. The document id is set to "doc".
 *
 * @param aText the document text to tokenize
 * @return the populated JCas
 * @throws UIMAException if the JCas cannot be created
 */
private JCas makeJCasOneSentence(String aText) throws UIMAException {
    TypeSystemDescription global = TypeSystemDescriptionFactory.createTypeSystemDescription();
    TypeSystemDescription local = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("src/test/resources/desc/type/webannoTestTypes.xml");
    TypeSystemDescription merged = CasCreationUtils.mergeTypeSystems(asList(global, local));

    JCas jcas = JCasFactory.createJCas(merged);
    DocumentMetaData.create(jcas).setDocumentId("doc");

    TokenBuilder<Token, Sentence> builder = new TokenBuilder<>(Token.class, Sentence.class);
    builder.buildTokens(jcas, aText);

    // Drop every sentence the token builder created ...
    for (Sentence sentence : select(jcas, Sentence.class)) {
        sentence.removeFromIndexes();
    }
    // ... and cover the whole text with one sentence instead.
    new Sentence(jcas, 0, jcas.getDocumentText().length()).addToIndexes();

    return jcas;
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
In the class RemoveZeroSizeTokensAndSentencesRepair, the method repair.
@Override
public void repair(Project aProject, CAS aCas, List<LogMessage> aMessages) {
try {
for (Sentence s : select(aCas.getJCas(), Sentence.class)) {
if (s.getBegin() >= s.getEnd()) {
s.removeFromIndexes();
aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed sentence with illegal span: %s", s));
}
}
for (Token t : select(aCas.getJCas(), Token.class)) {
if (t.getBegin() >= t.getEnd()) {
Lemma lemma = t.getLemma();
if (lemma != null) {
lemma.removeFromIndexes();
aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed lemma attached to token with illegal span: %s", t));
}
POS pos = t.getPos();
if (pos != null) {
pos.removeFromIndexes();
aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed POS attached to token with illegal span: %s", t));
}
Stem stem = t.getStem();
if (stem != null) {
stem.removeFromIndexes();
aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed stem attached to token with illegal span: %s", t));
}
t.removeFromIndexes();
aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed token with illegal span: %s", t));
}
}
} catch (CASException e) {
log.error("Unabled to access JCas", e);
aMessages.add(new LogMessage(this, LogLevel.ERROR, "Unabled to access JCas", e.getMessage()));
}
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project dkpro-lab by dkpro.
In the class ExamplePosAnnotator, the method process.
/**
 * For each sentence: extracts token- and context-level features into instances, then either
 * writes the instances out (training) or classifies them and stores the predicted labels as
 * POS annotations on the tokens (classification).
 *
 * <p>BUG FIX: the collection of newly created POS annotations used to be declared once
 * before the sentence loop and was never cleared, while the indexing loop ran inside the
 * sentence loop — so annotations created for earlier sentences had {@code addToIndexes()}
 * invoked again for every subsequent sentence, producing duplicate index entries. The
 * collection is now scoped to a single sentence.
 *
 * @param jCas the CAS to process
 * @throws AnalysisEngineProcessException on processing failure
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    // generate a list of training instances for each sentence in the document
    for (Sentence sentence : select(jCas, Sentence.class)) {
        // Newly created POS annotations for THIS sentence only (see bug-fix note above).
        Collection<TOP> addToIndexes = new ArrayList<TOP>();
        List<Instance<String>> instances = new ArrayList<Instance<String>>();
        List<Token> tokens = selectCovered(jCas, Token.class, sentence);
        // for each token, extract all feature values and the label
        for (Token token : tokens) {
            Instance<String> instance = new Instance<String>();
            // extract all features that require only the token annotation
            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
                instance.addAll(extractor.extract(jCas, token));
            }
            // extract all features that require the token and sentence annotations
            for (ContextExtractor<Token> extractor : this.contextFeatureExtractors) {
                instance.addAll(extractor.extractWithin(jCas, token, sentence));
            }
            // set the instance label from the token's part of speech
            // NOTE(review): assumes every token has a POS during training — getPos() would
            // NPE otherwise; confirm upstream guarantees this.
            if (this.isTraining()) {
                instance.setOutcome(token.getPos().getPosValue());
            }
            // add the instance to the list
            instances.add(instance);
        }
        if (this.isTraining()) {
            // for training, write instances to the data writer
            this.dataWriter.write(instances);
        } else {
            // for classification, set the labels as the token POS labels
            Iterator<Token> tokensIter = tokens.iterator();
            List<String> labels = classify(instances);
            for (String label : labels) {
                Token t = tokensIter.next();
                POS pos = t.getPos();
                if (pos == null) {
                    // Token has no POS yet: create one and remember it for indexing below.
                    pos = new POS(jCas, t.getBegin(), t.getEnd());
                    addToIndexes.add(pos);
                    t.setPos(pos);
                }
                pos.setPosValue(label);
            }
        }
        // Index only the annotations created for this sentence.
        for (TOP fs : addToIndexes) {
            fs.addToIndexes();
        }
    }
}
Aggregations