use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
The class TcfWriter, method writeSentence.
/**
 * Ensures the text corpus has a sentences layer. If the layer already exists (the input was
 * already a TCF file with sentences), nothing is written; otherwise a new layer is created and
 * populated from the Sentence annotations in the CAS, mapping each covered token back to its
 * TCF counterpart via its begin offset.
 *
 * @param aJCas the CAS holding Sentence/Token annotations
 * @param aTextCorpus the TCF text corpus being written
 * @param aTokensBeginPositionMap maps a token begin offset to the TCF token written earlier
 */
private void writeSentence(JCas aJCas, TextCorpus aTextCorpus, Map<Integer, eu.clarin.weblicht.wlfxb.tc.api.Token> aTokensBeginPositionMap) {
    // Sentence layer already present - keep it as-is (sentences are required downstream,
    // e.g. for BRAT)
    if (aTextCorpus.getSentencesLayer() != null) {
        getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: found");
        return;
    }

    SentencesLayer layer = aTextCorpus.createSentencesLayer();
    getLogger().debug("Layer [" + TextCorpusLayerTag.SENTENCES.getXmlName() + "]: created");

    for (Sentence s : select(aJCas, Sentence.class)) {
        List<eu.clarin.weblicht.wlfxb.tc.api.Token> sentenceTokens = new ArrayList<>();
        for (Token t : selectCovered(Token.class, s)) {
            sentenceTokens.add(aTokensBeginPositionMap.get(t.getBegin()));
        }
        layer.addSentence(sentenceTokens);
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
The class WebannoTsv2Reader, method setAnnotations.
/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotation,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation.
 *
 * @param aJcas the CAS that receives the Token/Sentence/span/relation annotations
 * @param aIs the TSV input stream
 * @param aEncoding character encoding of {@code aIs}
 * @param text buffer that accumulates the reconstructed document text (tokens joined by
 *            single spaces, one sentence per line)
 * @throws IOException if a data line does not carry the number of columns declared in the
 *             header
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text) throws IOException {
// getting header information
LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
// token number + token columns (minimum required)
int columns = 1;
// character offsets (into 'text') of the current token and of the current sentence start
int tokenStart = 0, sentenceStart = 0;
// span layers declared in the header, with the features of each
Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<>();
// relation layer -> the span layer it attaches to
Map<Type, Type> relationayers = new LinkedHashMap<>();
// an annotation for every feature in a layer
Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<>();
// store if this is a Begin/Intermediate/End of an annotation
Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<>();
// Store annotations of tokens so that it can be used later for relation
// annotations
Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<>();
// store target token ids used for a relation
Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<>();
// store tokens indexing with the concat of itsbegin-end so that lemma
// and pos annotation
// can be attached, if exists, later
indexedTokens = new HashMap<>();
while (lineIterator.hasNext()) {
String line = lineIterator.next().trim();
// blank line directly after a sentence break (no tokens accumulated yet) - nothing to close
if (line.trim().equals("") && sentenceStart == tokenStart) {
continue;
}
// blank line ends the current sentence: drop the trailing space appended after the last
// token, index the Sentence, then start a fresh line in 'text'
if (line.trim().equals("")) {
text.replace(tokenStart - 1, tokenStart, "");
tokenStart = tokenStart - 1;
Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
sentence.addToIndexes();
tokenStart++;
sentenceStart = tokenStart;
text.append("\n");
continue;
}
// sentence text comment - the text is rebuilt from the token columns instead
if (line.startsWith("#text=")) {
continue;
}
if (line.startsWith("#id=")) {
// it is a comment line
continue;
}
// header line declaring a layer and its features
if (line.startsWith("#")) {
columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
continue;
}
// so skip such lines
if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
continue;
}
// a token number, check if it didn't in the format NUM-NUM
// NOTE(review): line.split("-")[1] throws ArrayIndexOutOfBoundsException rather than
// skipping cleanly when the first column contains no "-"; confirm the format is
// guaranteed to be SENT-TOK here
if (!Character.isDigit(line.split("-")[1].charAt(0))) {
continue;
}
int count = StringUtils.countMatches(line, "\t");
if (columns != count) {
throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
}
// adding tokens and sentence
StringTokenizer lineTk = new StringTokenizer(line, "\t");
String tokenNumberColumn = lineTk.nextToken();
String tokenColumn = lineTk.nextToken();
Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
token.addToIndexes();
Type posType = JCasUtil.getType(aJcas, POS.class);
Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
// index tokens by "begin-end" so POS/Lemma annotations can be attached later
if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
// NOTE(review): the key is built via string concatenation, so it evaluates as
// (tokenStart + "-" + tokenStart) + length - e.g. begin=5, length=3 yields "5-53"
// instead of "5-8". Presumably "(tokenStart + tokenColumn.length())" was intended;
// verify against the code that reads 'indexedTokens' before changing, since the
// lookup side may build the same key.
indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
}
// adding the annotations
createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno, tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);
// advance past the token text plus the single separating space
tokenStart = tokenStart + tokenColumn.length() + 1;
text.append(tokenColumn).append(" ");
}
// close the final sentence if the file did not end with a blank line
if (tokenStart > sentenceStart) {
Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
sentence.addToIndexes();
text.append("\n");
}
createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
The class WebannoTsv2Writer, method convertToTsv.
/**
 * Serializes the CAS to the WebAnno TSV2 format: a single header line declaring all span
 * layers ("# typeName | feature ...") and relation layers (with a trailing "AttachTo="
 * column), followed, per sentence, by "#id="/"#text=" comment lines and one tab-separated
 * line per token carrying the per-feature annotation values ("_" when absent, "O" for
 * multiple-span layers without a value). Coreference layers (types with FIRST/NEXT features)
 * are skipped.
 *
 * @param aJCas the CAS to serialize
 * @param aOs the stream the TSV output is written to
 * @param aEncoding character encoding used for the output
 * @throws IOException if writing to {@code aOs} fails
 */
private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding) throws IOException, ResourceInitializationException, CASRuntimeException, CASException {
LowLevelCAS llCas = aJCas.getLowLevelCas();
// token FS ref -> "sentenceNumber-tokenNumber" id (instance fields filled by helpers)
tokenIds = new HashMap<>();
setTokenId(aJCas, tokenIds);
tokenPositions = new TreeMap<>();
setTokenPosition(aJCas, tokenPositions);
Map<Integer, Integer> getTokensPerSentence = new TreeMap<>();
setTokenSentenceAddress(aJCas, getTokensPerSentence);
// list of annotation types, excluding segmentation/metadata/coreference types which are
// handled separately or not exported
Set<Type> allTypes = new LinkedHashSet<>();
for (Annotation a : select(aJCas, Annotation.class)) {
if (!(a instanceof Token || a instanceof Sentence || a instanceof DocumentMetaData || a instanceof TagsetDescription || a instanceof CoreferenceLink)) {
allTypes.add(a.getType());
}
}
Set<Type> relationTypes = new LinkedHashSet<>();
// get all arc types: any type carrying a GOVERNOR feature is treated as a relation layer
for (Type type : allTypes) {
if (type.getFeatures().size() == 0) {
continue;
}
for (Feature feature : type.getFeatures()) {
if (feature.getShortName().equals(GOVERNOR)) {
relationTypes.add(type);
break;
}
}
}
// from here on, allTypes holds only span layers
allTypes.removeAll(relationTypes);
// relation annotations: relation type -> name of the span type it attaches to
Map<Type, String> relationTypesMap = new HashMap<>();
for (Type type : relationTypes) {
// dependency relations always attach to POS
if (type.getName().equals(Dependency.class.getName())) {
relationTypesMap.put(type, POS.class.getName());
continue;
}
// otherwise derive the attach type from the governor of an existing annotation
// NOTE(review): anno.getFeatureValue(feature) is dereferenced without a null check -
// confirm GOVERNOR is always set on exported relation annotations
for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), type)) {
for (Feature feature : type.getFeatures()) {
if (feature.getShortName().equals(GOVERNOR)) {
relationTypesMap.put(type, anno.getFeatureValue(feature).getType().getName());
}
}
}
}
// all span annotation first: write header entries and remember exportable features
Map<Feature, Type> spanFeatures = new LinkedHashMap<>();
allTypes: for (Type type : allTypes) {
if (type.getFeatures().size() == 0) {
continue;
}
for (Feature feature : type.getFeatures()) {
// coreference annotation not supported
if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
continue allTypes;
}
}
IOUtils.write(" # " + type.getName(), aOs, aEncoding);
for (Feature feature : type.getFeatures()) {
// built-in UIMA features (sofa/begin/end) are not exported as columns
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
continue;
}
spanFeatures.put(feature, type);
IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
}
}
// write all relation annotation first
Set<Feature> relationFeatures = new LinkedHashSet<>();
for (Type type : relationTypes) {
IOUtils.write(" # " + type.getName(), aOs, aEncoding);
for (Feature feature : type.getFeatures()) {
// skip built-in features and the structural governor/dependent endpoints
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
relationFeatures.add(feature);
IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
}
// Add the attach type for the relation annotation
IOUtils.write(" | AttachTo=" + relationTypesMap.get(type), aOs, aEncoding);
}
IOUtils.write("\n", aOs, aEncoding);
// span feature -> (token FS ref -> annotation value)
Map<Feature, Map<Integer, String>> allAnnos = new HashMap<>();
allTypes: for (Type type : allTypes) {
for (Feature feature : type.getFeatures()) {
// coreference annotation not supported
if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
continue allTypes;
}
}
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
continue;
}
Map<Integer, String> tokenAnnoMap = new TreeMap<>();
setTokenAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
allAnnos.put(feature, tokenAnnoMap);
}
}
// get tokens where dependents are drawn to
Map<Feature, Map<Integer, String>> relAnnos = new HashMap<>();
for (Type type : relationTypes) {
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
Map<Integer, String> tokenAnnoMap = new HashMap<>();
setRelationFeatureAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
relAnnos.put(feature, tokenAnnoMap);
}
}
// get tokens where dependents are drawn from - the governor
Map<Type, Map<Integer, String>> governorAnnos = new HashMap<>();
for (Type type : relationTypes) {
Map<Integer, String> govAnnoMap = new HashMap<>();
setRelationGovernorPos(aJCas.getCas(), govAnnoMap, type);
governorAnnos.put(type, govAnnoMap);
}
// body: per sentence, "#id"/"#text" comments followed by one line per token
int sentId = 1;
for (Sentence sentence : select(aJCas, Sentence.class)) {
IOUtils.write("#id=" + sentId++ + "\n", aOs, aEncoding);
IOUtils.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n", aOs, aEncoding);
for (Token token : selectCovered(Token.class, sentence)) {
// first two columns: token id and token text
IOUtils.write(tokenIds.get(llCas.ll_getFSRef(token)) + "\t" + token.getCoveredText() + "\t", aOs, aEncoding);
// all span annotations on this token
for (Feature feature : spanFeatures.keySet()) {
String annos = allAnnos.get(feature).get(llCas.ll_getFSRef(token));
if (annos == null) {
// "O" marks no-annotation on layers allowing multiple spans, "_" otherwise
if (multipleSpans.contains(spanFeatures.get(feature).getName())) {
IOUtils.write("O\t", aOs, aEncoding);
} else {
IOUtils.write("_\t", aOs, aEncoding);
}
} else {
IOUtils.write(annos + "\t", aOs, aEncoding);
}
}
// relation feature columns for this token
for (Type type : relationTypes) {
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
String annos = relAnnos.get(feature).get(llCas.ll_getFSRef(token));
if (annos == null) {
IOUtils.write("_\t", aOs, aEncoding);
} else {
IOUtils.write(annos + "\t", aOs, aEncoding);
}
}
// the governor positions
String govPos = governorAnnos.get(type).get(llCas.ll_getFSRef(token));
if (govPos == null) {
IOUtils.write("_\t", aOs, aEncoding);
} else {
IOUtils.write(governorAnnos.get(type).get(llCas.ll_getFSRef(token)) + "\t", aOs, aEncoding);
}
}
IOUtils.write("\n", aOs, aEncoding);
}
// blank line terminates the sentence
IOUtils.write("\n", aOs, aEncoding);
}
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
The class WebannoTsv2Writer, method setTokenId.
/**
 * Fills {@code aTokenAddress} with an id of the form "sentenceNumber-tokenNumber" for every
 * token, keyed by the token's low-level FS reference. Sentences and the tokens within each
 * sentence are both numbered starting at 1, in CAS index order.
 *
 * @param aTokenAddress receives the FS-ref -> id entries
 */
private void setTokenId(JCas aJCas, Map<Integer, String> aTokenAddress) {
    LowLevelCAS lowLevelCas = aJCas.getLowLevelCas();
    int sentenceNumber = 0;
    for (Sentence s : select(aJCas, Sentence.class)) {
        sentenceNumber++;
        int tokenNumber = 0;
        for (Token t : selectCovered(Token.class, s)) {
            tokenNumber++;
            aTokenAddress.put(lowLevelCas.ll_getFSRef(t), sentenceNumber + "-" + tokenNumber);
        }
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
The class ConstraintsGeneratorTest, method makeJCasOneSentence.
/**
 * Builds a test JCas over the fixed text "This is a test ." with document id "doc",
 * using the built-in type system merged with the test-local types from
 * webannoTestTypes.xml; Token and Sentence annotations are created by TokenBuilder.
 *
 * @return the populated JCas
 * @throws UIMAException if the type systems cannot be merged or the CAS cannot be created
 */
private JCas makeJCasOneSentence() throws UIMAException {
    TypeSystemDescription builtInTypes = TypeSystemDescriptionFactory.createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("src/test/resources/desc/types/webannoTestTypes.xml");
    TypeSystemDescription mergedTypes = CasCreationUtils.mergeTypeSystems(asList(builtInTypes, testTypes));

    JCas jCas = JCasFactory.createJCas(mergedTypes);
    DocumentMetaData.create(jCas).setDocumentId("doc");

    TokenBuilder<Token, Sentence> tokenBuilder = new TokenBuilder<>(Token.class, Sentence.class);
    tokenBuilder.buildTokens(jCas, "This is a test .");
    return jCas;
}
Aggregations