use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project webanno by webanno.
the class RemoveZeroSizeTokensAndSentencesRepair method repair.
/**
 * Removes sentences and tokens whose span is empty or inverted
 * ({@code begin >= end}) from the CAS indexes. For each such token, any
 * attached lemma, POS and stem annotations are removed from the indexes as
 * well. Every removal is reported through {@code aMessages}.
 *
 * @param aProject
 *            not used by this implementation.
 * @param aCas
 *            the CAS to repair; its JCas view is obtained via {@code getJCas()}.
 * @param aMessages
 *            receives one INFO message per removed annotation, or one ERROR
 *            message if the JCas cannot be obtained.
 */
@Override
public void repair(Project aProject, CAS aCas, List<LogMessage> aMessages) {
    try {
        // Iterate over a snapshot: removing annotations from the indexes while
        // walking the live collection returned by select() modifies the index
        // that backs the iteration.
        List<Sentence> sentences = new java.util.ArrayList<Sentence>(
                select(aCas.getJCas(), Sentence.class));
        for (Sentence s : sentences) {
            if (s.getBegin() >= s.getEnd()) {
                s.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed sentence with illegal span: %s", s));
            }
        }

        // Same snapshot approach for tokens.
        List<Token> tokens = new java.util.ArrayList<Token>(
                select(aCas.getJCas(), Token.class));
        for (Token t : tokens) {
            if (t.getBegin() >= t.getEnd()) {
                // Remove the annotations hanging off the token first so they
                // do not remain indexed after the token is removed.
                Lemma lemma = t.getLemma();
                if (lemma != null) {
                    lemma.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed lemma attached to token with illegal span: %s", t));
                }
                POS pos = t.getPos();
                if (pos != null) {
                    pos.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed POS attached to token with illegal span: %s", t));
                }
                Stem stem = t.getStem();
                if (stem != null) {
                    stem.removeFromIndexes();
                    aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed stem attached to token with illegal span: %s", t));
                }
                t.removeFromIndexes();
                aMessages.add(new LogMessage(this, LogLevel.INFO, "Removed token with illegal span: %s", t));
            }
        }
    } catch (CASException e) {
        log.error("Unable to access JCas", e);
        // The format string needs a placeholder, otherwise the exception
        // message passed as argument is silently discarded.
        aMessages.add(new LogMessage(this, LogLevel.ERROR, "Unable to access JCas: %s", e.getMessage()));
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-lab by dkpro.
the class ExamplePosAnnotator method process.
/**
 * Builds one feature instance per token, sentence by sentence. In training
 * mode the instances (labelled with the token's existing POS value) are
 * written to the data writer; otherwise the instances are classified and the
 * predicted labels are stored as the tokens' POS values, creating a new
 * {@link POS} annotation for tokens that do not have one yet.
 *
 * @param jCas
 *            the JCas whose sentences and tokens are processed.
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method; may be raised by the
 *             extractors, data writer or classifier invoked here.
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    // Newly created POS annotations are collected here and indexed once at
    // the end, so the indexes are not modified while sentences are iterated.
    Collection<TOP> addToIndexes = new ArrayList<TOP>();
    // generate a list of training instances for each sentence in the document
    for (Sentence sentence : select(jCas, Sentence.class)) {
        List<Instance<String>> instances = new ArrayList<Instance<String>>();
        List<Token> tokens = selectCovered(jCas, Token.class, sentence);
        // for each token, extract all feature values and the label
        for (Token token : tokens) {
            Instance<String> instance = new Instance<String>();
            // extract all features that require only the token annotation
            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
                instance.addAll(extractor.extract(jCas, token));
            }
            // extract all features that require the token and sentence annotations
            for (ContextExtractor<Token> extractor : this.contextFeatureExtractors) {
                instance.addAll(extractor.extractWithin(jCas, token, sentence));
            }
            // set the instance label from the token's part of speech
            if (this.isTraining()) {
                instance.setOutcome(token.getPos().getPosValue());
            }
            // add the instance to the list
            instances.add(instance);
        }
        if (this.isTraining()) {
            // for training, write instances to the data writer
            this.dataWriter.write(instances);
        } else {
            // for classification, set the labels as the token POS labels
            Iterator<Token> tokensIter = tokens.iterator();
            List<String> labels = classify(instances);
            for (String label : labels) {
                Token t = tokensIter.next();
                POS pos = t.getPos();
                if (pos == null) {
                    pos = new POS(jCas, t.getBegin(), t.getEnd());
                    addToIndexes.add(pos);
                    t.setPos(pos);
                }
                pos.setPosValue(label);
            }
        }
    }
    // Index each new POS exactly once. Doing this inside the sentence loop
    // without clearing the collection would re-add the annotations created
    // for earlier sentences on every later sentence.
    for (TOP fs : addToIndexes) {
        fs.addToIndexes();
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.
the class PosNGramMC method sentenceBasedDistribution.
/**
 * Counts POS n-grams separately within each sentence covered by
 * {@code focus}, so that no n-gram spans a sentence boundary.
 *
 * @param jcas
 *            the JCas to read sentences and POS annotations from.
 * @param focus
 *            the annotation whose covered sentences are examined.
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is
 *            used as the tag; otherwise its POS value is used.
 * @param minN
 *            minimum n-gram length passed to the n-gram iterable.
 * @param maxN
 *            maximum n-gram length passed to the n-gram iterable.
 * @return the frequency distribution of the joined n-gram strings.
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        // Collect the tag sequence of this sentence.
        List<String> tags = new ArrayList<String>();
        for (POS posAnno : selectCovered(jcas, POS.class, sentence)) {
            String tag = useCanonical
                    ? posAnno.getClass().getSimpleName()
                    : posAnno.getPosValue();
            tags.add(tag);
        }

        // Count every n-gram over the sentence's tag sequence.
        String[] tagArray = tags.toArray(new String[0]);
        NGramStringListIterable ngrams = new NGramStringListIterable(tagArray, minN, maxN);
        for (List<String> ngram : ngrams) {
            String key = StringUtils.join(ngram, NGRAM_GLUE);
            distribution.inc(key);
        }
    }

    return distribution;
}
use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.
the class PosNGramMC method documentBasedDistribution.
/**
 * Counts POS n-grams over the whole span covered by {@code focus}, treating
 * all covered POS annotations as a single sequence.
 *
 * @param jcas
 *            the JCas to read POS annotations from.
 * @param focus
 *            the annotation whose covered POS annotations are examined.
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is
 *            used as the tag; otherwise its POS value is used.
 * @param minN
 *            minimum n-gram length passed to the n-gram iterable.
 * @param maxN
 *            maximum n-gram length passed to the n-gram iterable.
 * @return the frequency distribution of the joined n-gram strings.
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    // Collect the tag sequence of the whole focus span.
    List<String> tags = new ArrayList<String>();
    for (POS posAnno : selectCovered(jcas, POS.class, focus)) {
        String tag = useCanonical
                ? posAnno.getClass().getSimpleName()
                : posAnno.getPosValue();
        tags.add(tag);
    }

    // Count every n-gram over that single sequence.
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    String[] tagArray = tags.toArray(new String[0]);
    NGramStringListIterable ngrams = new NGramStringListIterable(tagArray, minN, maxN);
    for (List<String> ngram : ngrams) {
        String key = StringUtils.join(ngram, NGRAM_GLUE);
        distribution.inc(key);
    }
    return distribution;
}
use of de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS in project dkpro-tc by dkpro.
the class ConversionAnnotator method process.
/**
 * Creates one {@link POS} annotation for every
 * {@code TextClassificationOutcome} in the JCas, spanning the same offsets
 * and carrying the outcome string (with {@code suffix} appended when it is
 * non-empty) as its POS value, and adds it to the indexes.
 *
 * @param aJCas
 *            the JCas to read outcomes from and add POS annotations to.
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Whether a suffix has to be appended is the same for all outcomes.
    boolean appendSuffix = suffix != null && !suffix.isEmpty();

    for (TextClassificationOutcome outcome : JCasUtil.select(aJCas, TextClassificationOutcome.class)) {
        String tagValue = outcome.getOutcome();
        if (appendSuffix) {
            tagValue = tagValue + suffix;
        }

        POS posAnno = new POS(aJCas, outcome.getBegin(), outcome.getEnd());
        posAnno.setPosValue(tagValue);
        posAnno.addToIndexes();
    }
}
Aggregations