use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Reader method createTokens.
/**
 * Registers the annotation unit for one token line and returns it.
 * <p>
 * {@code lines[0]} is the unit id. An id containing a {@code "."} (for example {@code 1-2.1}, a
 * sub-token under token 2) denotes a sub-token: it only gets an {@link AnnotationUnit} and no
 * {@link Token}. Any other id additionally gets a {@link Token} that is added to the CAS indexes
 * and linked to the unit in {@code units2Tokens}.
 *
 * @param aJCas the JCas in which the token is created
 * @param lines the columns of the current line; only {@code lines[0]} is read
 * @param begin begin offset of the unit
 * @param end end offset of the unit
 * @return the unit that was created and recorded in {@code units} and {@code token2Units}
 */
private AnnotationUnit createTokens(JCas aJCas, String[] lines, int begin, int end) {
    String unitId = lines[0];

    // Sub-tokens are not tokens in their own right: record the unit only.
    if (unitId.contains(".")) {
        AnnotationUnit subTokenUnit = new AnnotationUnit(begin, end, true, "");
        units.add(subTokenUnit);
        token2Units.put(unitId, subTokenUnit);
        return subTokenUnit;
    }

    Token token = new Token(aJCas, begin, end);
    AnnotationUnit tokenUnit = new AnnotationUnit(begin, end, false, "");
    units.add(tokenUnit);
    token.addToIndexes();
    token2Units.put(unitId, tokenUnit);
    units2Tokens.put(tokenUnit, token);
    return tokenUnit;
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Reader method addAnnotationWithNoFeature.
/**
 * Processes the first annotation cell recorded for {@code aUnit} on a layer whose cell is read
 * from {@code annotationsPerPostion.get(aType).get(aUnit).get(0)}.
 * <p>
 * The cell is split on unescaped {@code |} (stacked annotations) and each part on unescaped
 * {@code ;}. For every resulting entry:
 * <ul>
 * <li>a trailing {@code [..]} suffix sets the current reference number ({@code 0} when the
 * suffix content contains {@code _});</li>
 * <li>if an annotation is already registered in {@code aMultiTokUnits} under that reference
 * number, its end feature is moved to {@code aEnd};</li>
 * <li>otherwise the annotation from {@code aAnnos} at the index of the current stacked part is
 * registered under the reference number and added to the CAS indexes;</li>
 * <li>the reference number is then incremented.</li>
 * </ul>
 * A cell equal to {@code "_"} is skipped entirely.
 *
 * @param aJCas the JCas whose indexes receive newly started annotations
 * @param aType the annotation type of the layer
 * @param aUnit the unit whose cell is processed
 * @param aAnnos the annotations prepared for this unit, one per stacked part
 * @param aMultiTokUnits annotations already started, keyed by unit and then reference number
 * @param aEnd end offset to assign to an annotation that is continued
 * @param aRef the current reference number
 * @return the reference number after processing all entries
 */
private int addAnnotationWithNoFeature(JCas aJCas, Type aType, AnnotationUnit aUnit, List<AnnotationFS> aAnnos, Map<AnnotationUnit, Map<Integer, AnnotationFS>> aMultiTokUnits, int aEnd, int aRef) {
    String cell = annotationsPerPostion.get(aType).get(aUnit).get(0);
    if (cell.equals("_")) {
        return aRef;
    }

    // Separators only count when they are not preceded by a backslash.
    String stackSeparator = "(?<!\\\\)" + Pattern.quote("|");
    String slotSeparator = "(?<!\\\\)" + Pattern.quote(";");

    String[] stackedParts = cell.split(stackSeparator);
    for (int stackIdx = 0; stackIdx < stackedParts.length; stackIdx++) {
        for (String entry : stackedParts[stackIdx].split(slotSeparator)) {
            if (entry.endsWith("]")) {
                int open = entry.indexOf("[");
                String refText = entry.substring(open + 1, entry.length() - 1);
                aRef = refText.contains("_") ? 0 : Integer.parseInt(refText);
                // Strip the "[..]" suffix; the slice also makes a cell without "[" fail here.
                entry = entry.substring(0, open);
            }

            // Look for an annotation already registered under the current reference number.
            // The scan does not stop at the first unit that matches, so a later match replaces
            // an earlier one.
            boolean continuesExisting = false;
            AnnotationFS existing = null;
            for (Map<Integer, AnnotationFS> byRef : aMultiTokUnits.values()) {
                for (Map.Entry<Integer, AnnotationFS> candidate : byRef.entrySet()) {
                    if (aRef == candidate.getKey()) {
                        continuesExisting = true;
                        existing = candidate.getValue();
                        break;
                    }
                }
            }

            if (continuesExisting) {
                // Continuation: move the end of the registered annotation to aEnd.
                Feature endFeature = aType.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                existing.setIntValue(endFeature, aEnd);
                setAnnoRefPerUnit(aUnit, aType, aRef, existing);
            } else {
                // First occurrence: register the prepared annotation and index it.
                aMultiTokUnits.putIfAbsent(aUnit, new HashMap<>());
                AnnotationFS started = aAnnos.get(stackIdx);
                aMultiTokUnits.get(aUnit).put(aRef, started);
                aJCas.addFsToIndexes(started);
                setAnnoRefPerUnit(aUnit, aType, aRef, started);
            }
            aRef++;
        }
    }
    return aRef;
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Writer method setRelationAnnotation.
/**
 * Collects the relation annotations of every layer in {@code relationLayers} (except the
 * {@link Token} layer) and records them per unit in {@code annotationsPerPostion}.
 * <p>
 * For each relation the governor and dependent units are resolved, reference numbers are looked
 * up in {@code annotaionRefPerType} for endpoints flagged {@code true} in {@code ambigUnits},
 * and the result is handed to {@code setRelationAnnoPerFeature}. For the {@link Dependency}
 * layer the endpoint type is replaced by {@link POS} before the reference lookup.
 *
 * @param aJCas the JCas holding the relation annotations
 */
private void setRelationAnnotation(JCas aJCas) {
    for (String layerName : relationLayers) {
        if (layerName.equals(Token.class.getName())) {
            continue;
        }

        // Continue with the rows already recorded for this layer, if any.
        Map<AnnotationUnit, List<List<String>>> annotationsPertype = annotationsPerPostion.get(layerName);
        if (annotationsPertype == null) {
            annotationsPertype = new HashMap<>();
        }

        Type relationType = getType(aJCas.getCas(), layerName);
        Feature dependentFeature = null;
        Feature governorFeature = null;
        for (Feature candidate : relationType.getFeatures()) {
            String shortName = candidate.getShortName();
            if (shortName.equals(DEPENDENT)) {
                dependentFeature = candidate;
            }
            if (shortName.equals(GOVERNOR)) {
                governorFeature = candidate;
            }
        }

        for (AnnotationFS relation : CasUtil.select(aJCas.getCas(), relationType)) {
            AnnotationFS depFs = (AnnotationFS) relation.getFeatureValue(dependentFeature);
            AnnotationFS govFs = (AnnotationFS) relation.getFeatureValue(governorFeature);
            Type govType = govFs.getType();

            // Both endpoints are resolved against the governor's own type name, taken before
            // the Dependency/POS substitution below.
            String endpointLayerName = govType.getName();
            AnnotationUnit govUnit = relationEndpointUnit(govFs, endpointLayerName);
            AnnotationUnit depUnit = relationEndpointUnit(depFs, endpointLayerName);

            if (relationType.getName().equals(Dependency.class.getName())) {
                govType = aJCas.getCas().getTypeSystem().getType(POS.class.getName());
            }

            int govRef = 0;
            int depRef = 0;
            // The WebAnno world does not ever process Token as an annotation
            boolean endpointsAreTokens = govType.getName().equals(Token.class.getName());
            if (!endpointsAreTokens) {
                if (ambigUnits.get(govType.getName()).get(govUnit).equals(true)) {
                    govRef = annotaionRefPerType.get(govType).get(govFs);
                }
                if (ambigUnits.get(govType.getName()).get(depUnit).equals(true)) {
                    depRef = annotaionRefPerType.get(govType).get(depFs);
                }
            }

            setRelationAnnoPerFeature(annotationsPertype, relationType, relation, depUnit, govUnit, govRef, depRef, govType);
        }

        if (!annotationsPertype.isEmpty()) {
            annotationsPerPostion.put(layerName, annotationsPertype);
        }
    }
}

/**
 * Resolves the unit used for one endpoint of a relation: the first unit of the endpoint's span
 * if {@code ambigUnits} has an entry for it under {@code aLayerName}, otherwise the unit of the
 * span itself.
 */
private AnnotationUnit relationEndpointUnit(AnnotationFS aEndpoint, String aLayerName) {
    AnnotationUnit firstUnit = getFirstUnit(getUnit(aEndpoint.getBegin(), aEndpoint.getEnd(), aEndpoint.getCoveredText()));
    if (ambigUnits.get(aLayerName).get(firstUnit) == null) {
        return getUnit(aEndpoint.getBegin(), aEndpoint.getEnd(), aEndpoint.getCoveredText());
    }
    return firstUnit;
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Writer method getSubUnits.
/**
 * Creates the {@link AnnotationUnit}s needed to represent the sub-token annotation {@code aSTA}
 * relative to the entries of {@code units}, and collects them in {@code aSubUnits}.
 * <p>
 * The method works on a copy of {@code units} ({@code tmpUnits}). Every new unit is handed to
 * {@code updateUnitLists} together with the existing unit it was derived from and is added to
 * {@code aSubUnits}. Before returning, {@code units} is replaced by a copy of {@code tmpUnits}.
 * <p>
 * When the annotation extends beyond the unit it starts in, or starts in a gap before a unit, a
 * unit is created for the leading part, {@code aSTA}'s begin and text are advanced past that
 * part, and the method calls itself for the remainder.
 *
 * @param aSTA the sub-token annotation; its begin and text are modified when it is split
 * @param aSubUnits accumulator receiving the created units
 * @return {@code aSubUnits}
 */
private Set<AnnotationUnit> getSubUnits(SubTokenAnno aSTA, Set<AnnotationUnit> aSubUnits) {
// last unit that matched none of the cases in the main loop below (set only in its final else)
AnnotationUnit prevUnit = null;
// working copy, so that `units` itself is not modified while it is being iterated
List<AnnotationUnit> tmpUnits = new ArrayList<>(units);
// zero-width annotation: create an empty-text unit and register it against the first unit
// whose begin and end are both at or after that position, then return immediately
if (aSTA.getBegin() == aSTA.getEnd()) {
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, "");
for (AnnotationUnit unit : units) {
if (unit.begin >= newUnit.begin && unit.end >= newUnit.end) {
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
units = new ArrayList<>(tmpUnits);
return aSubUnits;
}
}
// no such unit found: fall through to the general scan below
}
for (AnnotationUnit unit : units) {
// the first unit ending after the annotation stops the scan; a new unit is only created
// when that unit starts exactly at the annotation's begin
// NOTE(review): an annotation lying strictly inside this unit (later begin, earlier end)
// produces no unit here — confirm this is intended
if (unit.end > aSTA.end) {
if (unit.begin == aSTA.begin) {
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
}
break;
}
// this is a sub-token annotation
// (only reached when unit.end <= aSTA.end because of the break above)
if (unit.begin <= aSTA.getBegin() && aSTA.getBegin() <= unit.end && aSTA.getEnd() <= unit.end) {
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
} else // if sub-token annotation crosses multiple tokens
if ((unit.begin <= aSTA.getBegin() && aSTA.getBegin() < unit.end && aSTA.getEnd() > unit.end)) {
// number of characters of the annotation that fall inside this unit
int thisSubTextLen = unit.end - aSTA.begin;
// unit for the part up to the end of this unit
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.end, false, aSTA.getText().substring(0, thisSubTextLen));
aSubUnits.add(newUnit);
updateUnitLists(tmpUnits, unit, newUnit);
// advance the annotation past the consumed part and process the remainder recursively;
// the loop continues with the next unit after the recursive call returns
aSTA.setBegin(getNextUnitBegin(aSTA.getBegin()));
aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
getSubUnits(aSTA, aSubUnits);
} else // empty annotation between tokens
if (aSTA.getBegin() <= unit.begin && prevUnit != null && prevUnit.end < unit.begin) {
// length of the part between the annotation's begin and the begin of this unit
int thisSubTextLen = unit.begin - aSTA.begin;
// unit for that leading part; registered relative to the previous unit
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.begin, false, aSTA.getText().substring(0, thisSubTextLen));
aSubUnits.add(newUnit);
updateUnitLists(tmpUnits, prevUnit, newUnit);
// continue recursively with the part starting at this unit
aSTA.setBegin(unit.begin);
aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
getSubUnits(aSTA, aSubUnits);
} else {
prevUnit = unit;
}
}
// NOTE(review): a recursive call above has already reassigned `units` from its own working
// copy; this assignment replaces it with this invocation's tmpUnits — confirm that
// updateUnitLists keeps the two consistent
units = new ArrayList<>(tmpUnits);
return aSubUnits;
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Writer method setSpanAnnoPerFeature.
/**
 * Serialises the feature values of one span annotation for one unit and appends the resulting
 * row to the rows recorded for that unit in {@code aAnnotationsPertype}.
 * <p>
 * The annotation's reference number is obtained from {@code getRefId} and reset to {@code 0}
 * when {@code ambigUnits} flags the unit (its first unit, if that has an entry) as
 * {@code false}. A positive reference number is appended to every plain feature value as
 * {@code [ref]}.
 * <ul>
 * <li>The sofa, begin and end features and the features named {@code GOVERNOR},
 * {@code DEPENDENT}, {@code FIRST} and {@code NEXT} are skipped.</li>
 * <li>A slot feature contributes two columns: the {@code ;}-separated roles and the
 * {@code ;}-separated targets, or {@code _} for each when there is nothing to write.</li>
 * <li>Any other feature contributes its escaped string value, or {@code *} when the value is
 * {@code null}.</li>
 * <li>A layer without any serialised feature gets a single {@code *} placeholder.</li>
 * </ul>
 * The column names seen are recorded in {@code featurePerLayer}.
 * <p>
 * Each slot target's reference number is kept in its own local variable. Previously it was
 * written into the annotation's reference number, so plain features serialised after a slot
 * feature carried the id of the last slot target instead of the annotation's own.
 *
 * @param aAnnotationsPertype rows recorded so far for this layer, keyed by unit
 * @param aType the annotation type of the layer
 * @param aFs the annotation being serialised
 * @param aUnit the unit the row belongs to
 * @param aIsMultiToken passed through to {@code setAnnoFeature}
 * @param aIsFirst passed through to {@code setAnnoFeature}
 */
private void setSpanAnnoPerFeature(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType, AnnotationFS aFs, AnnotationUnit aUnit, boolean aIsMultiToken, boolean aIsFirst) {
    List<String> annoPerFeatures = new ArrayList<>();
    featurePerLayer.putIfAbsent(aType.getName(), new LinkedHashSet<>());

    // Reference number of this annotation; 0 means "no [ref] suffix".
    int ref = getRefId(aType, aFs, aUnit);
    if (ambigUnits.get(aType.getName()).get(getFirstUnit(aUnit)) != null && ambigUnits.get(aType.getName()).get(getFirstUnit(aUnit)).equals(false)) {
        ref = 0;
    }
    if (ambigUnits.get(aType.getName()).get(getFirstUnit(aUnit)) == null && ambigUnits.get(aType.getName()).get(aUnit).equals(false)) {
        ref = 0;
    }

    for (Feature feature : aType.getFeatures()) {
        if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT) || feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
            continue;
        }

        // if slot feature
        if (slotFeatures != null && slotFeatures.contains(feature.getName())) {
            FeatureStructure slotValue = aFs.getFeatureValue(feature);
            if (slotValue != null) {
                ArrayFS array = (ArrayFS) slotValue;
                StringBuilder sbRole = new StringBuilder();
                StringBuilder sbTarget = new StringBuilder();
                for (FeatureStructure linkFS : array.toArray()) {
                    String role = linkFS.getStringValue(linkFS.getType().getFeatureByBaseName("role"));
                    AnnotationFS targetFs = (AnnotationFS) linkFS.getFeatureValue(linkFS.getType().getFeatureByBaseName("target"));
                    Type tType = targetFs.getType();
                    AnnotationUnit firstUnit = getFirstUnit(targetFs);

                    // Reference number of the link target. Kept separate from `ref` so that
                    // the annotation's own reference number stays intact for the features
                    // serialised after this slot.
                    int targetRef = getRefId(tType, targetFs, firstUnit);
                    // Check if the target is ambiguous or not
                    if (ambigUnits.get(tType.getName()).get(firstUnit).equals(false)) {
                        targetRef = 0;
                    }

                    // Escape special characters; a missing role is written as "*".
                    role = role == null ? "*" : replaceEscapeChars(role);

                    // record the actual target type column number if slot target is
                    // uima.tcas.Annotation
                    int targetTypeNumber = 0;
                    if (slotFeatureTypes.get(feature).getName().equals(CAS.TYPE_NAME_ANNOTATION)) {
                        targetTypeNumber = layerMaps.get(tType);
                    }

                    // Separate this link from the ones already written.
                    if (sbRole.length() > 0) {
                        sbRole.append(";");
                        sbTarget.append(";");
                    }
                    sbRole.append(role);
                    sbTarget.append(unitsLineNumber.get(firstUnit)).append(targetTypeNumber == 0 ? "" : "-" + targetTypeNumber).append(targetRef > 0 ? "[" + targetRef + "]" : "");
                }
                annoPerFeatures.add(sbRole.toString().isEmpty() ? "_" : sbRole.toString());
                annoPerFeatures.add(sbTarget.toString().isEmpty() ? "_" : sbTarget.toString());
            } else {
                // no links at all: both the role and the target column are empty
                annoPerFeatures.add("_");
                annoPerFeatures.add("_");
            }
            featurePerLayer.get(aType.getName()).add(ROLE + feature.getName() + "_" + slotLinkTypes.get(feature.getName()));
            featurePerLayer.get(aType.getName()).add(slotFeatureTypes.get(feature).getName());
        } else {
            String annotation = aFs.getFeatureValueAsString(feature);
            // Escape special characters; a missing value is written as "*".
            annotation = annotation == null ? "*" : replaceEscapeChars(annotation);
            annotation = annotation + (ref > 0 ? "[" + ref + "]" : "");
            // only add BIO markers to multiple annotations
            setAnnoFeature(aIsMultiToken, aIsFirst, annoPerFeatures, annotation);
            featurePerLayer.get(aType.getName()).add(feature.getShortName());
        }
    }

    aAnnotationsPertype.putIfAbsent(aUnit, new ArrayList<>());
    // If the layer does not have a feature at all, add a dummy * as a place holder
    if (annoPerFeatures.isEmpty()) {
        setAnnoFeature(aIsMultiToken, aIsFirst, annoPerFeatures, "*" + (ref > 0 ? "[" + ref + "]" : ""));
    }
    aAnnotationsPertype.get(aUnit).add(annoPerFeatures);
}
Aggregations