use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Writer method addChinFeatureAnno.
/**
 * Records one link of a coreference chain against the annotation unit(s) it covers.
 * <p>
 * When {@code aUnit} is one of the known {@code units}, the link is written for that unit
 * directly. Otherwise the span of {@code aFs} is broken into sub units and the link is written
 * once per sub unit, flagging the first one.
 *
 * @param aAnnotationsPertype
 *            stores annotations per type, keyed by the annotation unit they belong to
 * @param aType
 *            the coreference annotation type
 * @param aFs
 *            the feature structure
 * @param aUnit
 *            the current annotation unit of the coreference chain
 * @param aLinkNo
 *            position of the link inside its chain: 1 for the first link, n for the last one
 * @param achainNo
 *            position of the chain: 1 for the first chain, n for the last one, where n is the
 *            number of coreference chains in the document
 */
private void addChinFeatureAnno(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType, AnnotationFS aFs, AnnotationUnit aUnit, int aLinkNo, int achainNo) {
    featurePerLayer.putIfAbsent(aType.getName(), new LinkedHashSet<>());

    // Annotation sits exactly on a known unit (per token): write it and be done.
    if (units.contains(aUnit)) {
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, aUnit, aLinkNo, achainNo, false, false);
        return;
    }

    // Annotation is on a sub-token or on multiple tokens: describe its span ...
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(aFs.getBegin());
    span.setEnd(aFs.getEnd());
    span.setText(aFs.getCoveredText());

    // ... and write the link for every sub unit of that span.
    boolean spansSeveralTokens = isMultiToken(aFs);
    boolean firstSubUnit = true;
    Set<AnnotationUnit> collectedSubUnits = new LinkedHashSet<>();
    for (AnnotationUnit subUnit : getSubUnits(span, collectedSubUnits)) {
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, subUnit, aLinkNo, achainNo, spansSeveralTokens, firstSubUnit);
        firstSubUnit = false;
    }
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Writer method getFirstUnit.
/**
 * Returns the first sub unit that {@code getSubUnits} yields for the span of the given
 * annotation.
 *
 * @param targetFs
 *            the annotation whose begin, end and covered text describe the span
 * @return the first sub unit in iteration order, or {@code null} when there is none
 */
private AnnotationUnit getFirstUnit(AnnotationFS targetFs) {
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(targetFs.getBegin());
    span.setEnd(targetFs.getEnd());
    span.setText(targetFs.getCoveredText());

    Set<AnnotationUnit> collectedSubUnits = new LinkedHashSet<>();
    for (AnnotationUnit candidate : getSubUnits(span, collectedSubUnits)) {
        // Only the first element is of interest.
        return candidate;
    }
    return null;
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Reader method addAnnotations.
/**
 * Importing span annotations including slot annotations.
 * <p>
 * For every type in {@code annotationsPerPostion}, every annotation unit of that type and every
 * feature of the type's layer, the stored column value is parsed and written onto the
 * annotations that were pre-created in {@code aAnnosPerTypePerUnit}. Value syntax handled here:
 * {@code _} means "no value", {@code *} is turned into {@code null}, an unescaped {@code |}
 * separates stacked annotations, an unescaped {@code ;} separates multiple slot values and a
 * trailing {@code [...]} carries a reference number (or a {@code g_d} pair for relations).
 * <p>
 * Side effects visible in this method: annotations are added to the CAS indexes, tokens get
 * their POS/Lemma/Stem/MorphologicalFeatures set, {@code chainAnnosPerTyep} is filled with chain
 * links and {@code annosPerRef} receives the per-unit, per-reference annotations of each type.
 *
 * @param aJCas
 *            the JCas used to create link feature structures and to index annotations
 * @param aAnnosPerTypePerUnit
 *            per type and per annotation unit, the annotations to be filled with feature values
 */
private void addAnnotations(JCas aJCas, Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit) {
for (Type type : annotationsPerPostion.keySet()) {
// unit -> (reference number -> annotation); used to find the annotation a later unit continues
Map<AnnotationUnit, Map<Integer, AnnotationFS>> multiTokUnits = new HashMap<>();
// running reference number for the annotations of this type
int ref = 1;
// to see if it is on multiple token
AnnotationFS prevAnnoFs = null;
for (AnnotationUnit unit : annotationsPerPostion.get(type).keySet()) {
int end = unit.end;
List<AnnotationFS> annos = aAnnosPerTypePerUnit.get(type).get(unit);
// index of the column value that belongs to the feature currently processed
int j = 0;
// the most recently seen role (link) feature; passed to addSlotAnnotations below
Feature linkeF = null;
// link feature structures collected per slot-owning annotation
Map<AnnotationFS, List<FeatureStructure>> linkFSesPerSlotAnno = new HashMap<>();
// layers without any feature are handled by a dedicated helper
if (allLayers.get(type).size() == 0) {
ref = addAnnotationWithNoFeature(aJCas, type, unit, annos, multiTokUnits, end, ref);
continue;
}
for (Feature feat : allLayers.get(type)) {
String anno = annotationsPerPostion.get(type).get(unit).get(j);
// "_" marks an empty column value
if (!anno.equals("_")) {
// index into annos: one annotation per stacked value
int i = 0;
// if it is a slot annotation (multiple slots per
// single annotation
// (Target1<--role1--Base--role2-->Target2)
int slot = 0;
boolean targetAdd = false;
// split on "|" unless it is preceded by a backslash
String stackedAnnoRegex = "(?<!\\\\)" + Pattern.quote("|");
String[] stackedAnnos = anno.split(stackedAnnoRegex);
for (String mAnnos : stackedAnnos) {
// split on ";" unless it is preceded by a backslash
String multipleSlotAnno = "(?<!\\\\)" + Pattern.quote(";");
for (String mAnno : mAnnos.split(multipleSlotAnno)) {
// text found between "[" and the last character of the value, if any
String depRef = "";
String multSpliter = "(?<!\\\\)" + Pattern.quote("[");
// is this slot target ambiguous?
boolean ambigTarget = false;
if (mAnno.split(multSpliter).length > 1) {
ambigTarget = true;
depRef = mAnno.substring(mAnno.indexOf("[") + 1, mAnno.length() - 1);
// a reference containing "_" is a g_d pair (parsed in the relation branch below)
// and leaves ref untouched; otherwise it replaces the running reference number
ref = depRef.contains("_") ? ref : Integer.valueOf(mAnno.substring(mAnno.indexOf("[") + 1, mAnno.length() - 1));
// strip the "[...]" suffix from the value
mAnno = mAnno.substring(0, mAnno.indexOf("["));
}
// "*" stands for a null feature value
if (mAnno.equals("*")) {
mAnno = null;
}
// look for an already registered annotation with the same reference number; if there
// is one, this unit continues it instead of starting a new annotation
boolean isMultitoken = false;
if (!multiTokUnits.isEmpty() && prevAnnoFs != null && prevAnnoFs.getBegin() != unit.begin) {
contAnno: for (AnnotationUnit u : multiTokUnits.keySet()) {
for (Integer r : multiTokUnits.get(u).keySet()) {
if (ref == r) {
isMultitoken = true;
prevAnnoFs = multiTokUnits.get(u).get(r);
break contAnno;
}
}
}
}
if (isMultitoken) {
// continuation: stretch the existing annotation to the end of this unit
Feature endF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
prevAnnoFs.setIntValue(endF, end);
mAnno = getEscapeChars(mAnno);
prevAnnoFs.setFeatureValueFromString(feat, mAnno);
if (feat.getShortName().equals(REF_LINK)) {
// since REF_REL do not start with BIO,
// update it it...
annos.set(i, prevAnnoFs);
}
setAnnoRefPerUnit(unit, type, ref, prevAnnoFs);
} else {
if (roleLinks.containsKey(feat)) {
// slot role: create a link feature structure carrying the role name
linkeF = feat;
FeatureStructure link = aJCas.getCas().createFS(slotLinkTypes.get(feat));
Feature roleFeat = link.getType().getFeatureByBaseName("role");
mAnno = getEscapeChars(mAnno);
link.setStringValue(roleFeat, mAnno);
linkFSesPerSlotAnno.putIfAbsent(annos.get(i), new ArrayList<>());
linkFSesPerSlotAnno.get(annos.get(i)).add(link);
} else if (roleTargets.containsKey(feat)) {
// slot target: complete the link created for the same slot position
FeatureStructure link = linkFSesPerSlotAnno.get(annos.get(i)).get(slot);
// a value with more than two "-" separated parts ends in a number that selects the
// target type from layerMaps; 0 means "use the type registered for this feature"
int customTypeNumber = 0;
if (mAnno.split("-").length > 2) {
customTypeNumber = Integer.valueOf(mAnno.substring(mAnno.lastIndexOf("-") + 1));
mAnno = mAnno.substring(0, mAnno.lastIndexOf("-"));
}
AnnotationUnit targetUnit = token2Units.get(mAnno);
Type tType = null;
if (customTypeNumber == 0) {
tType = roleTargets.get(feat);
} else {
tType = layerMaps.get(customTypeNumber);
}
AnnotationFS targetFs;
if (ambigTarget) {
// pick the target by its reference number
targetFs = annosPerRef.get(tType).get(targetUnit).get(ref);
} else {
// no reference number given: take the first annotation stored for the target unit
targetFs = annosPerRef.get(tType).get(targetUnit).entrySet().iterator().next().getValue();
}
link.setFeatureValue(feat, targetFs);
addSlotAnnotations(linkFSesPerSlotAnno, linkeF);
targetAdd = true;
slot++;
} else if (feat.getShortName().equals(REF_REL)) {
// chain relation value of the form "<relation>-><chainNo>-<linkNo>"
int chainNo = Integer.valueOf(mAnno.split("->")[1].split("-")[0]);
int LinkNo = Integer.valueOf(mAnno.split("->")[1].split("-")[1]);
chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
// this chain link has already been registered: nothing more to do for this value
if (chainAnnosPerTyep.get(type).get(chainNo) != null && chainAnnosPerTyep.get(type).get(chainNo).get(LinkNo) != null) {
continue;
}
String refRel = mAnno.split("->")[0];
refRel = getEscapeChars(refRel);
if (refRel.equals("*")) {
refRel = null;
}
annos.get(i).setFeatureValueFromString(feat, refRel);
chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
chainAnnosPerTyep.get(type).putIfAbsent(chainNo, new TreeMap<>());
chainAnnosPerTyep.get(type).get(chainNo).put(LinkNo, annos.get(i));
} else if (feat.getShortName().equals(REF_LINK)) {
// chain link value: set it and index the annotation
mAnno = getEscapeChars(mAnno);
annos.get(i).setFeatureValueFromString(feat, mAnno);
aJCas.addFsToIndexes(annos.get(i));
} else if (depFeatures.get(type) != null && depFeatures.get(type).equals(feat)) {
// relation feature: depRef, when present, is "g_d" with the reference numbers used to
// look up the governor (g) and dependent (d) annotations; 0 means "not given"
int g = depRef.isEmpty() ? 0 : Integer.valueOf(depRef.split("_")[0]);
int d = depRef.isEmpty() ? 0 : Integer.valueOf(depRef.split("_")[1]);
Type depType = depTypess.get(type);
// the value itself names the governor unit
AnnotationUnit govUnit = token2Units.get(mAnno);
// the last entry stored for this unit names the dependent unit
int l = annotationsPerPostion.get(type).get(unit).size();
String thisUnit = annotationsPerPostion.get(type).get(unit).get(l - 1);
AnnotationUnit depUnit = token2Units.get(thisUnit);
AnnotationFS govFs;
AnnotationFS depFs;
if (depType.getName().equals(POS.class.getName())) {
// depType is POS: both ends are resolved to tokens
depType = aJCas.getCas().getTypeSystem().getType(Token.class.getName());
govFs = units2Tokens.get(govUnit);
depFs = units2Tokens.get(unit);
} else // depType is the Token type itself
if (depType.getName().equals(Token.class.getName())) {
govFs = units2Tokens.get(govUnit);
depFs = units2Tokens.get(unit);
} else if (g == 0 && d == 0) {
// neither end carries a reference number: take the first annotation on each unit
govFs = annosPerRef.get(depType).get(govUnit).entrySet().iterator().next().getValue();
depFs = annosPerRef.get(depType).get(depUnit).entrySet().iterator().next().getValue();
} else if (g == 0) {
// only the dependent carries a reference number
govFs = annosPerRef.get(depType).get(govUnit).entrySet().iterator().next().getValue();
depFs = annosPerRef.get(depType).get(depUnit).get(d);
} else {
// the governor is looked up by reference number; the dependent is the first annotation
// on its unit (d is not consulted in this branch)
govFs = annosPerRef.get(depType).get(govUnit).get(g);
depFs = annosPerRef.get(depType).get(depUnit).entrySet().iterator().next().getValue();
}
annos.get(i).setFeatureValue(feat, depFs);
annos.get(i).setFeatureValue(type.getFeatureByBaseName(GOVERNOR), govFs);
// move the relation's begin or end onto the dependent, depending on which side it lies
if (depFs.getBegin() <= annos.get(i).getBegin()) {
Feature beginF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN);
annos.get(i).setIntValue(beginF, depFs.getBegin());
} else {
Feature endF = type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
annos.get(i).setIntValue(endF, depFs.getEnd());
}
aJCas.addFsToIndexes(annos.get(i));
} else {
// any other feature: register the annotation under its reference number, remember it
// as the previous annotation, set the value and index it
mAnno = getEscapeChars(mAnno);
multiTokUnits.putIfAbsent(unit, new HashMap<>());
multiTokUnits.get(unit).put(ref, annos.get(i));
prevAnnoFs = annos.get(i);
annos.get(i).setFeatureValueFromString(feat, mAnno);
aJCas.addFsToIndexes(annos.get(i));
setAnnoRefPerUnit(unit, type, ref, annos.get(i));
}
}
// with stacked annotations the reference number advances after every processed value
// (this is skipped for values that hit the "continue" above)
if (stackedAnnos.length > 1) {
ref++;
}
}
// attach the well-known token-level annotations to the token of this unit
if (type.getName().equals(POS.class.getName())) {
units2Tokens.get(unit).setPos((POS) annos.get(i));
}
if (type.getName().equals(Lemma.class.getName())) {
units2Tokens.get(unit).setLemma((Lemma) annos.get(i));
}
if (type.getName().equals(Stem.class.getName())) {
units2Tokens.get(unit).setStem((Stem) annos.get(i));
}
if (type.getName().equals(MorphologicalFeatures.class.getName())) {
units2Tokens.get(unit).setMorph((MorphologicalFeatures) annos.get(i));
}
// next stacked value goes to the next pre-created annotation
i++;
}
// once targets were attached, start with a fresh link collection for the next feature
if (targetAdd) {
linkFSesPerSlotAnno = new HashMap<>();
}
} else {
// an empty value ends any running multi-token annotation
prevAnnoFs = null;
}
j++;
}
// the unit carried an annotation: advance the reference number for the next unit
if (prevAnnoFs != null) {
ref++;
}
}
annosPerRef.put(type, multiTokUnits);
}
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Reader method setAnnotations.
/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotation,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation.
 * <p>
 * Line kinds recognised here: {@code #T_...} layer/feature headers, {@code #FORMAT=...}
 * version headers, {@code #Text=...} sentence text (several consecutive lines are joined with
 * {@code LF}), blank lines (which close the sentence collected so far) and token lines. After
 * all lines are read, the collected column values are turned into annotations.
 * <p>
 * A sentence is created exactly once: either when the blank line following it is read, or at
 * the end of input when no blank line followed it.
 *
 * @param aJCas
 *            the JCas that receives tokens, sentences and annotations
 * @param aIs
 *            the TSV input; it is read to the end but not closed here
 * @param aEncoding
 *            the character encoding of {@code aIs}
 * @throws IOException
 *             if a token line does not contain the expected number of tab characters
 */
private void setAnnotations(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    // -1 means "no token of the current sentence seen yet"
    int sentBegin = -1;
    int sentEnd = 0;
    int prevSentEnd = 0;
    // text of the sentence currently being collected; empty once it has been flushed
    StringBuilder sentLineSb = new StringBuilder();
    // -1 until a known #FORMAT header is read; 3, 31 or 32 afterwards
    int format = -1;

    while (lineIterator.hasNext()) {
        String line = lineIterator.next();

        if (line.startsWith("#T_")) {
            setLayerAndFeature(aJCas, line);
            continue;
        }

        if (line.startsWith("#Text=")) {
            String text = line.substring(line.indexOf("=") + 1);
            // the unescaping scheme depends on the declared format version
            if (format == 31) {
                text = unescapeJava(text);
            } else if (format == 32) {
                text = unEscapeSpecial(text);
            }
            // further #Text lines of the same sentence are appended on a new line
            if (sentLineSb.length() > 0) {
                sentLineSb.append(LF);
            }
            sentLineSb.append(text);
            continue;
        }

        if (line.startsWith("#FORMAT=")) {
            // unknown format declarations are ignored and leave the format unchanged
            if ("#FORMAT=WebAnno TSV 3".equals(line)) {
                format = 3;
            } else if ("#FORMAT=WebAnno TSV 3.1".equals(line)) {
                format = 31;
            } else if ("#FORMAT=WebAnno TSV 3.2".equals(line)) {
                format = 32;
            }
            continue;
        }

        if (line.trim().isEmpty()) {
            // a blank line closes the sentence collected so far, if any
            if (sentLineSb.length() > 0) {
                createSentence(aJCas, sentLineSb.toString(), sentBegin, sentEnd, prevSentEnd);
                prevSentEnd = sentEnd;
                // reset for next sentence begin
                sentBegin = -1;
                sentLineSb = new StringBuilder();
            }
            continue;
        }

        // everything else is a token line
        line = line.trim();
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        // NOTE(review): the '*' after the look-behind appears to make it optional, so a tab
        // preceded by a backslash is probably still treated as a separator - confirm before
        // changing, since the column count check above counts every tab as well.
        String regex = "(?<!\\\\)*" + Pattern.quote(TAB);
        String[] fields = line.split(regex);

        // second column holds the token offsets as "begin-end"
        String[] offsets = fields[1].split("-");
        int begin = Integer.parseInt(offsets[0]);
        int end = Integer.parseInt(offsets[1]);
        if (sentBegin == -1) {
            sentBegin = begin;
        }
        sentEnd = end;

        AnnotationUnit unit = createTokens(aJCas, fields, begin, end);
        // column values are handed over starting at index 3
        setAnnosPerTypePerUnit(fields, unit, 3);
    }

    // The last sentence: only flush it when it is still pending. Flushing based on the text of
    // the last sentence seen would create it a second time (with sentBegin == -1) whenever the
    // input ends with a blank line that already closed it.
    if (sentLineSb.length() > 0) {
        createSentence(aJCas, sentLineSb.toString(), sentBegin, sentEnd, prevSentEnd);
    }

    Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> annosPerTypePerUnit = new HashMap<>();
    setAnnosPerUnit(aJCas, annosPerTypePerUnit);
    addAnnotations(aJCas, annosPerTypePerUnit);
    addChainAnnotations(aJCas);
}
use of de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit in project webanno by webanno.
the class WebannoTsv3Reader method setAnnosPerUnit.
/**
 * Pre-creates the annotations that will later be filled with feature values.
 * <p>
 * For every type and annotation unit in {@code annotationsPerPostion}, as many annotations of
 * that type are created over the unit's span as the largest number of {@code |}-stacked values
 * found in any of the unit's column values (at least one).
 *
 * @param aJCas
 *            the JCas whose CAS creates the annotations
 * @param aAnnosPerTypePerUnit
 *            receives, per type, the created annotations keyed by annotation unit
 */
private void setAnnosPerUnit(JCas aJCas, Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit) {
    // split on "|" unless it is preceded by a backslash
    String stackSeparator = "(?<!\\\\)" + Pattern.quote("|");

    for (Type layer : annotationsPerPostion.keySet()) {
        Map<AnnotationUnit, List<AnnotationFS>> createdPerUnit = new HashMap<>();

        for (AnnotationUnit unit : annotationsPerPostion.get(layer).keySet()) {
            int spanBegin = unit.begin;
            int spanEnd = unit.end;

            // the deepest stack among the unit's values decides how many annotations are needed
            int stackDepth = 1;
            for (String value : annotationsPerPostion.get(layer).get(unit)) {
                stackDepth = Math.max(stackDepth, value.split(stackSeparator).length);
            }

            List<AnnotationFS> created = new ArrayList<>();
            for (int k = 0; k < stackDepth; k++) {
                created.add(aJCas.getCas().createAnnotation(layer, spanBegin, spanEnd));
            }
            createdPerUnit.put(unit, created);
        }

        aAnnosPerTypePerUnit.put(layer, createdPerUnit);
    }
}
Aggregations