Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in project webanno by webanno.
Class Tsv3XCasDocumentBuilder, method of.
/**
 * Builds a {@link TsvDocument} from the annotations found in the given JCas, using the
 * given schema to decide which chain head types and annotation types are scanned.
 * <p>
 * The steps are: (1) create a TSV sentence/token for every UIMA sentence/token, (2) collect
 * chain elements by following the chain links, (3) attach every annotation of the schema's
 * types to the tokens or newly created sub-tokens it covers, (4) scan all units for active
 * columns and ambiguous slot references and (5) activate placeholder columns for active
 * types that have no other active column.
 *
 * @param aSchema
 *            the schema providing the chain head types, the UIMA types, the layer type of
 *            each UIMA type and the columns.
 * @param aJCas
 *            the JCas whose sentences, tokens and annotations are read.
 * @return the populated document.
 */
public static TsvDocument of(TsvSchema aSchema, JCas aJCas) {
// The format header is fixed to "WebAnno TSV" version "3.2".
TsvFormatHeader format = new TsvFormatHeader("WebAnno TSV", "3.2");
TsvDocument doc = new TsvDocument(format, aSchema, aJCas);
// Fill document with all the sentences and the tokens covered by each sentence
for (Sentence uimaSentence : select(aJCas, Sentence.class)) {
TsvSentence sentence = doc.createSentence(uimaSentence);
for (Token uimaToken : selectCovered(Token.class, uimaSentence)) {
sentence.createToken(uimaToken);
}
}
// Scan for chains: starting at each chain head, follow the "first" feature and then the
// "next" features until a null link is reached, collecting the elements in order.
for (Type headType : aSchema.getChainHeadTypes()) {
for (FeatureStructure chainHead : CasUtil.selectFS(aJCas.getCas(), headType)) {
List<AnnotationFS> elements = new ArrayList<>();
AnnotationFS link = getFeature(chainHead, CHAIN_FIRST_FEAT, AnnotationFS.class);
while (link != null) {
elements.add(link);
link = getFeature(link, CHAIN_NEXT_FEAT, AnnotationFS.class);
}
// Chains without any element are not registered with the document.
if (!elements.isEmpty()) {
// The element type is the range type of the head's "first" feature.
Type elementType = headType.getFeatureByBaseName(CHAIN_FIRST_FEAT).getRange();
doc.createChain(headType, elementType, elements);
}
}
}
// Build indexes over the token start and end positions such that we can quickly locate
// tokens based on their offsets. The flat token list preserves document order and is used
// further below to walk from a begin token to an end token.
NavigableMap<Integer, TsvToken> tokenBeginIndex = new TreeMap<>();
NavigableMap<Integer, TsvToken> tokenEndIndex = new TreeMap<>();
List<TsvToken> tokens = new ArrayList<>();
for (TsvSentence sentence : doc.getSentences()) {
for (TsvToken token : sentence.getTokens()) {
tokenBeginIndex.put(token.getBegin(), token);
tokenEndIndex.put(token.getEnd(), token);
tokens.add(token);
}
}
// Attach every annotation of the schema's UIMA types to the token(s) and/or sub-token(s)
// (units) that it covers.
for (Type type : aSchema.getUimaTypes()) {
LayerType layerType = aSchema.getLayerType(type);
// This flag is handed to every addUimaAnnotation call below; it is true only for SPAN
// layers.
boolean addDisambiguationIdIfStacked = SPAN.equals(layerType);
for (AnnotationFS annotation : CasUtil.select(aJCas.getCas(), type)) {
doc.activateType(annotation.getType());
// Get the relevant begin and end offsets for the current annotation
int begin = annotation.getBegin();
int end = annotation.getEnd();
// For relations, the offsets of the relation's target annotation are used instead of
// the offsets of the relation annotation itself.
// NOTE(review): targetFS is dereferenced without a null check - confirm that relations
// always have their target feature set.
if (RELATION.equals(layerType)) {
AnnotationFS targetFS = getFeature(annotation, FEAT_REL_TARGET, AnnotationFS.class);
begin = targetFS.getBegin();
end = targetFS.getEnd();
}
// Begin token: the token with the greatest begin offset <= begin.
// End token: the token with the smallest end offset >= end.
// NOTE(review): floorEntry/ceilingEntry return null if no such token exists, which
// would cause a NullPointerException here - confirm that annotations never start
// before the first token or end after the last token.
TsvToken beginToken = tokenBeginIndex.floorEntry(begin).getValue();
TsvToken endToken = tokenEndIndex.ceilingEntry(end).getValue();
// For zero-width annotations, the token obtained from the tokenEndIndex is used for
// both ends, overriding the value obtained from the tokenBeginIndex.
if (begin == end) {
beginToken = endToken;
}
boolean singleToken = beginToken == endToken;
boolean zeroWitdh = begin == end;
boolean multiTokenCapable = SPAN.equals(layerType) || CHAIN.equals(layerType);
// Case 1: the annotation starts exactly at the begin token's start and ends exactly at
// the end token's end. No sub-token is needed and the begin token is the unit the
// annotation is mapped to.
if (beginToken.getBegin() == begin && endToken.getEnd() == end) {
doc.mapFS2Unit(annotation, beginToken);
beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
// NOTE(review): if beginToken == endToken, this adds the annotation to the same token
// a second time - presumably addUimaAnnotation tolerates that; confirm.
if (multiTokenCapable) {
endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
}
} else if (zeroWitdh) {
// Case 2: zero-width annotation that is not aligned with the token boundaries. A
// sub-token is created on the token and the annotation is mapped to that sub-token.
TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
doc.mapFS2Unit(annotation, t);
t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
} else {
// Case 3: the annotation is misaligned with the token boundaries at its begin, at its
// end, or at both. If it starts inside the begin token, a sub-token covering the
// annotated part of the begin token is created and becomes the unit mapped to the
// annotation.
if (beginToken.getBegin() < begin) {
TsvSubToken t = beginToken.createSubToken(begin, min(beginToken.getEnd(), end));
doc.mapFS2Unit(annotation, t);
t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
} else // If the sub-token is not ID-defining, then the begin token is ID-defining
{
beginToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
doc.mapFS2Unit(annotation, beginToken);
}
// If the annotation ends inside the end token, a sub-token covering the annotated part
// of the end token is created. The annotation is only (re-)mapped to that sub-token if
// the begin and end tokens differ (singleToken is false).
if (endToken.getEnd() > end) {
TsvSubToken t = endToken.createSubToken(max(endToken.getBegin(), begin), end);
t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
if (!singleToken) {
doc.mapFS2Unit(annotation, t);
}
} else if (!singleToken && multiTokenCapable) {
endToken.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
}
}
// For multi-token annotations on multi-token capable layers, also add the annotation to
// every token strictly between the begin token and the end token. The iteration starts
// at the begin token (which is skipped) and stops when the end token is reached.
if (multiTokenCapable && !singleToken) {
ListIterator<TsvToken> i = tokens.listIterator(tokens.indexOf(beginToken));
TsvToken t;
while ((t = i.next()) != endToken) {
if (t != beginToken) {
t.addUimaAnnotation(annotation, addDisambiguationIdIfStacked);
}
}
}
// Multi-token span annotations must get a disambiguation ID
if (SPAN.equals(layerType) && !singleToken) {
doc.addDisambiguationId(annotation);
}
}
}
// Scan all created units (tokens and their sub-tokens) to see which columns actually
// contain values and which slot references are ambiguous
for (TsvSentence sentence : doc.getSentences()) {
for (TsvToken token : sentence.getTokens()) {
scanUnitForActiveColumns(token);
scanUnitForAmbiguousSlotReferences(token);
for (TsvSubToken subToken : token.getSubTokens()) {
scanUnitForActiveColumns(subToken);
scanUnitForAmbiguousSlotReferences(subToken);
}
}
}
// Activate the placeholder columns for any active types for which no other columns are
// active: start with all active types and remove those owning at least one active column.
Set<Type> activeTypesNeedingPlaceholders = new HashSet<>(doc.getActiveTypes());
for (TsvColumn col : doc.getActiveColumns()) {
activeTypesNeedingPlaceholders.remove(col.uimaType);
}
for (TsvColumn col : doc.getSchema().getColumns()) {
if (PLACEHOLDER.equals(col.featureType) && activeTypesNeedingPlaceholders.contains(col.uimaType)) {
doc.activateColumn(col);
}
}
return doc;
}
Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in project webanno by webanno.
Class Tsv3XDeserializer, method readContent.
/**
 * Reads the body of a TSV file line by line and populates the given document with
 * sentences, tokens, sub-tokens and their annotations. The document text is reconstructed
 * from the sentence header lines and set on the document's JCas once all lines are read.
 * <p>
 * Each line is classified as a sentence header, an empty inter-sentence line, a token line,
 * a sub-token line (the unit ID contains a ".") or the end of input. The sequence of line
 * kinds is validated before the line is processed.
 *
 * @param aIn
 *            the reader positioned at the first content line; its line number is used in
 *            error messages.
 * @param aDoc
 *            the document to populate.
 * @throws IOException
 *             if reading fails, if the lines appear in an unexpected order, or if a token
 *             or sub-token line lacks the ID/offset fields or carries malformed offsets.
 */
private void readContent(LineNumberReader aIn, TsvDocument aDoc) throws IOException {
    StringBuilder text = new StringBuilder();
    State prevState = State.INTER_SENTENCE_SPACE;
    State state = State.INTER_SENTENCE_SPACE;
    StringBuilder sentenceText = new StringBuilder();
    TsvSentence sentence = null;
    TsvToken token = null;
    String line = aIn.readLine();
    while (!State.END.equals(state)) {
        // These variables are only used in TOKEN and SUBTOKEN states.
        String[] fields = null;
        int begin = -1;
        int end = -1;

        // Determine the status of the current line
        if (startsWith(line, PREFIX_TEXT)) {
            state = State.SENTENCE;
        } else if (line == null) {
            state = State.END;
        } else if (isEmpty(line)) {
            state = State.INTER_SENTENCE_SPACE;
        } else {
            fields = splitPreserveAllTokens(line, FIELD_SEPARATOR);

            // Report malformed unit lines as IOException with the line number, like the
            // other format errors in this method, instead of letting unchecked
            // ArrayIndexOutOfBounds/NumberFormat exceptions escape.
            if (fields.length < 2) {
                throw new IOException("Line " + aIn.getLineNumber()
                        + ": Expected unit ID and offsets but got [" + line + "]");
            }

            // Get token metadata
            String id = fields[0];
            String[] offsets = split(fields[1], "-");
            if (offsets == null || offsets.length < 2) {
                throw new IOException("Line " + aIn.getLineNumber()
                        + ": Expected offsets in the form [begin-end] but got [" + fields[1]
                        + "]");
            }
            try {
                begin = Integer.parseInt(offsets[0]);
                end = Integer.parseInt(offsets[1]);
            } catch (NumberFormatException e) {
                throw new IOException("Line " + aIn.getLineNumber()
                        + ": Offsets are not numeric [" + fields[1] + "]", e);
            }

            // TOKEN or SUBTOKEN?
            if (id.contains(".")) {
                state = State.SUBTOKEN;
            } else {
                state = State.TOKEN;
            }
        }

        // Assert that the order of information in the file is correct
        switch (prevState) {
        case INTER_SENTENCE_SPACE:
            if (!State.SENTENCE.equals(state)) {
                throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header but got [" + state + "]");
            }
            break;
        case SENTENCE:
            if (!(State.SENTENCE.equals(state) || State.TOKEN.equals(state))) {
                throw new IOException("Line " + aIn.getLineNumber() + ": Expected sentence header or token but got [" + state + "]");
            }
            break;
        case TOKEN:
        case SUBTOKEN:
            if (!(State.INTER_SENTENCE_SPACE.equals(state) || State.END.equals(state) || State.TOKEN.equals(state) || State.SUBTOKEN.equals(state))) {
                throw new IOException("Line " + aIn.getLineNumber() + ": Expected token, sub-token or sentence break but got [" + state + "]");
            }
            break;
        }

        // Do the actual parsing
        switch (state) {
        case END:
        case INTER_SENTENCE_SPACE:
            // End of sentence action. The order check above guarantees that this state is
            // only reached after a TOKEN or SUBTOKEN line, so a sentence is open here.
            // The -1 here is to account for the trailing line break
            sentence.getUimaSentence().setEnd(text.length() - 1);
            sentence.getUimaSentence().addToIndexes();
            sentence = null;
            break;
        case TOKEN:
            // End of sentence header action: the first token after the sentence header
            // lines opens a new sentence.
            if (State.SENTENCE.equals(prevState)) {
                // If the text collected so far is longer than the offset at which this
                // token begins, drop its last character (asserted to be a line break and
                // to be the only surplus character).
                if (text.length() > begin) {
                    assert text.length() == begin + 1;
                    assert text.charAt(text.length() - 1) == LINE_BREAK;
                    text.setLength(text.length() - 1);
                }

                // If the text collected so far is shorter than the offset at which this
                // token begins, pad it with spaces to fill the gap.
                if (text.length() < begin) {
                    text.append(repeat(' ', begin - text.length()));
                }

                assert text.length() == begin;
                assert sentence == null;

                Sentence uimaSentence = new Sentence(aDoc.getJCas());
                uimaSentence.setBegin(text.length());
                sentence = aDoc.createSentence(uimaSentence);

                text.append(sentenceText);
                sentenceText.setLength(0);
            }

            // Token parsing action
            Token uimaToken = new Token(aDoc.getJCas(), begin, end);
            uimaToken.addToIndexes();
            token = sentence.createToken(uimaToken);

            // Read annotations from the columns
            parseAnnotations(aDoc, sentence, token, fields);
            break;
        case SUBTOKEN:
            // Read annotations from the columns. The sub-token is attached to the most
            // recently read token.
            TsvSubToken subToken = token.createSubToken(begin, end);
            parseAnnotations(aDoc, sentence, subToken, fields);
            break;
        case SENTENCE:
            // Header parsing action: take everything after the first "=", unescape it and
            // buffer it (followed by a line break) until the first token line is seen.
            String textFragment = substringAfter(line, "=");
            textFragment = unescapeText(aDoc.getFormatHeader(), textFragment);
            sentenceText.append(textFragment);
            sentenceText.append(LINE_BREAK);
            break;
        }

        prevState = state;
        line = aIn.readLine();
    }

    aDoc.getJCas().setDocumentText(text.toString());

    // After all data has been read, we also add the annotations with disambiguation ID to
    // the CAS indexes. This ensures we only add them after their final begin/end offsets
    // have been determined since most of these annotations are actually multi-token
    // annotations.
    // The LinkedHashSet ensures each feature structure is added once, in encounter order,
    // even if it is attached to several units.
    CAS cas = aDoc.getJCas().getCas();
    Set<FeatureStructure> fses = new LinkedHashSet<>();
    for (TsvSentence s : aDoc.getSentences()) {
        for (TsvToken t : s.getTokens()) {
            for (Type type : t.getUimaTypes()) {
                fses.addAll(t.getUimaAnnotations(type));
            }
            for (TsvSubToken st : t.getSubTokens()) {
                for (Type type : st.getUimaTypes()) {
                    fses.addAll(st.getUimaAnnotations(type));
                }
            }
        }
    }
    fses.forEach(cas::addFsToIndexes);
}
Use of de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.model.TsvSubToken in project webanno by webanno.
Class Tsv3XSerializer, method write.
/**
 * Writes a single sentence to the given writer.
 * <p>
 * The covered text of the sentence is split at line breaks and every resulting line is
 * emitted as an escaped text header line. Afterwards, each token of the sentence is
 * written on its own line, immediately followed by one line per sub-token of that token.
 *
 * @param aOut
 *            the writer receiving the output.
 * @param aSentence
 *            the sentence to write.
 */
public void write(PrintWriter aOut, TsvSentence aSentence) {
    String coveredText = aSentence.getUimaSentence().getCoveredText();

    // One text header line per line of the sentence text
    for (String textLine : splitPreserveAllTokens(coveredText, LINE_BREAK)) {
        aOut.print(PREFIX_TEXT);
        aOut.print(escapeText(textLine));
        aOut.print(LINE_BREAK);
    }

    // Tokens, each followed directly by its sub-tokens
    for (TsvToken currentToken : aSentence.getTokens()) {
        write(aOut, currentToken);
        aOut.write(LINE_BREAK);

        for (TsvSubToken currentSubToken : currentToken.getSubTokens()) {
            write(aOut, currentSubToken);
            aOut.write(LINE_BREAK);
        }
    }
}
Aggregations