use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma in project webanno by webanno.
the class LemmaLayerInitializer method configure.
/**
 * Registers the lemma layer and its features for the given project.
 * <p>
 * Three schema elements are created through the annotation schema service, in this order:
 * a {@code lemma} feature on the token layer, the lemma span layer itself (attached to the
 * token layer via that feature), and the string-valued {@code value} feature of the lemma
 * layer.
 *
 * @param aProject
 *            the project in which the layer and features are created.
 * @throws IOException
 *             declared by the overridden method; propagated from the calls made here.
 */
@Override
public void configure(Project aProject) throws IOException {
    final String lemmaTypeName = Lemma.class.getName();

    // Feature on the token layer that points from a token to its lemma annotation.
    AnnotationLayer tokens = annotationSchemaService.getLayer(Token.class.getName(), aProject);
    AnnotationFeature attachFeature = new AnnotationFeature(aProject, tokens, "lemma", "lemma",
            lemmaTypeName);
    annotationSchemaService.createFeature(attachFeature);

    // The lemma layer itself, attached to the token layer through the feature above.
    AnnotationLayer layer = new AnnotationLayer(lemmaTypeName, "Lemma", SPAN_TYPE, aProject,
            true);
    layer.setAttachFeature(attachFeature);
    layer.setAttachType(tokens);
    annotationSchemaService.createLayer(layer);

    // String feature holding the actual lemma value.
    AnnotationFeature valueFeature = new AnnotationFeature();
    valueFeature.setProject(aProject);
    valueFeature.setLayer(layer);
    valueFeature.setName("value");
    valueFeature.setUiName("Lemma");
    valueFeature.setType(CAS.TYPE_NAME_STRING);
    valueFeature.setDescription("lemma Annotation");
    annotationSchemaService.createFeature(valueFeature);
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma in project webanno by webanno.
the class Tsv3XDeserializer method getOrCreateSpanAnnotation.
/**
 * Looks up the span annotation belonging to the given column and unit, creating it if it
 * does not exist yet.
 * <p>
 * The lookup proceeds in three steps: (1) an annotation already registered on the unit for
 * the column's type and stacking index, (2) an annotation registered in the document under
 * the disambiguation ID, whose end is then extended to the end of this unit, (3) a freshly
 * created annotation covering the unit.
 *
 * @param aCol
 *            the column whose UIMA type determines the annotation type.
 * @param aUnit
 *            the unit the annotation belongs to.
 * @param aStackingIndex
 *            the stacking index used for the lookup on the unit.
 * @param aDisambiguationInfo
 *            the disambiguation ID as a decimal string, or {@code null} if there is none.
 * @return the existing or newly created annotation.
 */
private AnnotationFS getOrCreateSpanAnnotation(TsvColumn aCol, TsvUnit aUnit, int aStackingIndex, String aDisambiguationInfo) {
    // -1 doubles as the "no disambiguation ID" marker.
    int disambiguationId = -1;
    if (aDisambiguationInfo != null) {
        disambiguationId = Integer.parseInt(aDisambiguationInfo);
    }
    final boolean hasDisambiguationId = disambiguationId != -1;

    // Step 1: the annotation may already have been seen in this unit via another column.
    AnnotationFS annotation = aUnit.getUimaAnnotation(aCol.uimaType, aStackingIndex);

    // Step 2: the annotation may have been started in a previous unit.
    if (annotation == null && hasDisambiguationId) {
        annotation = aUnit.getDocument().getDisambiguatedAnnotation(disambiguationId);
        if (annotation != null) {
            aUnit.addUimaAnnotation(annotation);
            // Stretch the existing annotation so it also covers this unit. The end is set
            // through the generic feature setter because AnnotationFS offers no setEnd().
            setFeature(annotation, CAS.FEATURE_BASE_NAME_END, aUnit.getEnd());
        }
    }

    // Step 3: nothing found, so a new annotation spanning the unit is created.
    if (annotation == null) {
        annotation = aUnit.getDocument().getJCas().getCas().createAnnotation(aCol.uimaType,
                aUnit.getBegin(), aUnit.getEnd());
        aUnit.addUimaAnnotation(annotation);

        // Slot features of the new annotation start out as empty lists.
        for (TsvColumn col : aUnit.getDocument().getSchema().getColumns(aCol.uimaType)) {
            if (SLOT_TARGET.equals(col.featureType)) {
                setFeature(annotation, col.uimaFeature.getShortName(), emptyList());
            }
        }

        // DKPro Core annotations that hang off a token are additionally linked from the
        // token. The type names are distinct, so at most one branch applies.
        String typeName = aCol.uimaType.getName();
        if (Lemma.class.getName().equals(typeName)) {
            ((TsvToken) aUnit).getUimaToken().setLemma((Lemma) annotation);
        } else if (Stem.class.getName().equals(typeName)) {
            ((TsvToken) aUnit).getUimaToken().setStem((Stem) annotation);
        } else if (MorphologicalFeatures.class.getName().equals(typeName)) {
            ((TsvToken) aUnit).getUimaToken().setMorph((MorphologicalFeatures) annotation);
        } else if (POS.class.getName().equals(typeName)) {
            ((TsvToken) aUnit).getUimaToken().setPos((POS) annotation);
        }
    }

    // Register the annotation under its disambiguation ID so that step 2 can find it again
    // when a later unit carries the same ID.
    if (hasDisambiguationId) {
        aUnit.getDocument().addDisambiguationId(annotation, disambiguationId);
    }
    return annotation;
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma in project webanno by webanno.
the class Conll2009Reader method convert.
/**
 * Converts CoNLL-2009 style sentences obtained from {@code readSentence} into annotations
 * in the given JCas.
 * <p>
 * For every sentence the method creates tokens (separated by a single space) and, depending
 * on the reader flags and on whether the respective column is unused, lemmas, part-of-speech
 * tags, morphological features, semantic predicates with their arguments and dependency
 * relations. Lemma, POS and morphological features are also linked from their token. Each
 * sentence is terminated by a line break in the document text.
 *
 * @param aJCas
 *            the JCas that receives the text and the annotations.
 * @param aReader
 *            the reader the sentences are read from.
 * @throws IOException
 *             if the POS mapping provider cannot be configured, or as thrown while reading.
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Nothing to convert for an empty sentence (presumably caused by several
            // sentence-end markers following each other).
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // Tokens, Lemma, POS
        Map<Integer, Token> tokens = new HashMap<>();
        List<SemPred> preds = new ArrayList<>();
        Iterator<String[]> wordIterator = words.iterator();
        while (wordIterator.hasNext()) {
            String[] word = wordIterator.next();
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(Integer.valueOf(word[ID]), token);
            // Tokens are separated by a blank, but no blank follows the last token.
            if (wordIterator.hasNext()) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POS].intern());
                // WebAnno did not yet backport the coarse grained POS feature from
                // DKPro Core 1.9.0
                // POSUtils.assignCoarseValue(pos);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEAT]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEAT]);
                morphtag.addToIndexes();
                // Link the features from the token, just like lemma and POS above.
                token.setMorph(morphtag);
            }
            // Read semantic predicate
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (UNUSED.equals(word[DEPREL])) {
                    continue;
                }
                int depId = Integer.parseInt(word[ID]);
                int govId = Integer.parseInt(word[HEAD]);
                Token dependent = tokens.get(depId);
                // Model the root (head 0) as a loop onto itself. Not using ROOT here
                // because WebAnno cannot deal with elevated types.
                Token governor = govId == 0 ? dependent : tokens.get(govId);
                Dependency rel = new Dependency(aJCas);
                rel.setGovernor(governor);
                rel.setDependent(dependent);
                rel.setDependencyType(word[DEPREL]);
                // The relation covers the span of its dependent.
                rel.setBegin(dependent.getBegin());
                rel.setEnd(dependent.getEnd());
                // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas
                // classes
                FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC);
                rel.addToIndexes();
            }
        }
        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time; the p-th predicate reads its
            // arguments from column APRED + p.
            for (int p = 0; p < preds.size(); p++) {
                List<SemArgLink> args = new ArrayList<>();
                for (String[] word : words) {
                    if (!UNUSED.equals(word[APRED + p])) {
                        Token token = tokens.get(Integer.valueOf(word[ID]));
                        SemArg arg = new SemArg(aJCas, token.getBegin(), token.getEnd());
                        arg.addToIndexes();
                        SemArgLink link = new SemArgLink(aJCas);
                        link.setRole(word[APRED + p]);
                        link.setTarget(arg);
                        args.add(link);
                    }
                }
                SemPred pred = preds.get(p);
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma in project webanno by webanno.
the class ConllUReader method convert.
/**
 * Converts CoNLL-U style sentences obtained from {@code readSentence} into annotations in
 * the given JCas.
 * <p>
 * For every sentence the method creates tokens and, depending on the reader flags and on
 * whether the respective column is unused, lemmas, part-of-speech tags, morphological
 * features, surface forms for ID ranges (e.g. {@code 1-2}) and basic as well as enhanced
 * dependency relations. Each sentence is terminated by a line break in the document text.
 *
 * @param aJCas
 *            the JCas that receives the text and the annotations.
 * @param aReader
 *            the reader the sentences are read from.
 * @throws IOException
 *             if the POS mapping provider cannot be configured, or as thrown while reading.
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Nothing to convert for an empty sentence (presumably caused by several
            // sentence-end markers following each other).
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // State of the ID range (surface form) currently being read; -1/null means none.
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;
        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            // A line with an ID range only announces a surface form; it is not a token.
            if (word[ID].contains("-")) {
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.parseInt(fragments[0]);
                surfaceEnd = Integer.parseInt(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }
            // Read token
            int tokenIdx = Integer.parseInt(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);
                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    // This is best effort: the raw string is already stored above, so an
                    // item without a key or without a value is skipped instead of
                    // aborting the import.
                    if (keyValue.length < 2 || keyValue[0].isEmpty()) {
                        continue;
                    }
                    // The feature base name is the key with a lower-cased first character.
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }
            // Read surface form once the last token of the announced ID range is reached
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.parseInt(word[ID]);
                    int govId = Integer.parseInt(word[HEAD]);
                    // NOTE(review): makeDependency is defined outside this method; the
                    // root (head 0) is presumably modelled there as a loop onto itself.
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    int depId = Integer.parseInt(word[ID]);
                    for (String item : items) {
                        // Each item is "head:relation". Split at the first colon only,
                        // because the relation itself may contain colons (e.g. "nmod:of").
                        String[] sItem = item.split(":", 2);
                        int govId = Integer.parseInt(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, tokens, word);
                    }
                }
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma in project webanno by webanno.
the class TcfReader method convertLemma.
/**
 * Creates a {@link Lemma} annotation for every entry of the TCF lemmas layer and links it
 * from the corresponding token.
 * <p>
 * Each lemma takes its offsets from the first TCF token it refers to. Nothing is done when
 * the corpus has no lemmas layer.
 *
 * @param aJCas
 *            the JCas in which the lemma annotations are created.
 * @param aCorpusData
 *            the TCF corpus providing the lemmas layer.
 * @param aTokens
 *            the already converted tokens, looked up by TCF token ID.
 */
private void convertLemma(JCas aJCas, TextCorpus aCorpusData, Map<String, Token> aTokens) {
    if (aCorpusData.getLemmasLayer() == null) {
        return;
    }
    for (int i = 0, count = aCorpusData.getLemmasLayer().size(); i < count; i++) {
        eu.clarin.weblicht.wlfxb.tc.api.Token[] lemmaTokens = aCorpusData.getLemmasLayer()
                .getTokens(aCorpusData.getLemmasLayer().getLemma(i));
        String value = aCorpusData.getLemmasLayer().getLemma(i).getString();

        // Only the first TCF token of the lemma determines the target token.
        Token token = aTokens.get(lemmaTokens[0].getID());

        Lemma outLemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
        outLemma.setValue(value);
        outLemma.addToIndexes();

        // Set the lemma to the token
        token.setLemma(outLemma);
    }
}
Aggregations