Example of org.apache.uima.cas.Type usage in the webanno project.
Taken from the ConllUReader class, convert method.
/**
 * Reads CoNLL-U formatted sentences from the given reader and adds the corresponding
 * annotations (Token, Lemma, POS, MorphologicalFeatures, SurfaceForm, Dependency,
 * Sentence) to the given JCas, subject to the readLemma/readPos/readMorph/readDependency
 * configuration flags.
 *
 * @param aJCas the CAS to fill
 * @param aReader the CoNLL-U input
 * @throws IOException if the POS mapping provider cannot be configured or reading fails
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);

    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences, e.g. two sentence markers following each other.
            continue;
        }

        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;

        // Pending multi-word surface token range (e.g. ID line "1-2") and its form;
        // -1 means no range is currently open.
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;

        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            if (word[ID].contains("-")) {
                // Multi-word token line ("BEGIN-END <form>"): remember the range and
                // materialize the SurfaceForm once the last covered token is read.
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.parseInt(fragments[0]);
                surfaceEnd = Integer.parseInt(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }

            // Read token
            int tokenIdx = Integer.parseInt(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }

            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }

            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }

            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);

                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    if (keyValue.length != 2) {
                        // Skip malformed "Key=Value" entries instead of failing on the
                        // whole document with an ArrayIndexOutOfBoundsException.
                        continue;
                    }
                    // UIMA feature names start lower-case while UD keys start upper-case
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }

            // Read surface form — emit once the last token covered by the pending
            // multi-word range has been created.
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }

            sentenceEnd = token.getEnd();
        }

        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (word[ID].contains("-")) {
                    // Multi-word token range lines carry no dependency information of
                    // their own; their ID is not parseable as a single integer.
                    continue;
                }
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.parseInt(word[ID]);
                    int govId = Integer.parseInt(word[HEAD]);
                    // Model the root as a loop onto itself
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC,
                            tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    for (String item : items) {
                        String[] sItem = item.split(":");
                        int depId = Integer.parseInt(word[ID]);
                        int govId = Integer.parseInt(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1],
                                DependencyFlavor.ENHANCED, tokens, word);
                    }
                }
            }
        }

        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();

        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Example of org.apache.uima.cas.Type usage in the webanno project.
Taken from the WebannoTsv2Reader class, setAnnotations method.
/**
* Iterate through lines and create span annotations accordingly. For multiple span annotation,
* based on the position of the annotation in the line, update only the end position of the
* annotation
*/
/**
 * Iterate through lines and create span annotations accordingly. For multiple span annotation,
 * based on the position of the annotation in the line, update only the end position of the
 * annotation.
 *
 * @param aJcas the CAS to fill
 * @param aIs the TSV input stream
 * @param aEncoding the character encoding of the input stream
 * @param text accumulates the document text as tokens are read
 * @throws IOException if a data line does not have the number of columns declared in the header
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {
    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    // token number + token columns (minimum required)
    int columns = 1;
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<>();
    Map<Type, Type> relationayers = new LinkedHashMap<>();
    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<>();
    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<>();
    // Store annotations of tokens so that it can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<>();
    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<>();
    // store tokens indexed by the concatenation "begin-end" so that lemma and pos
    // annotations can be attached later, if they exist
    indexedTokens = new HashMap<>();
    while (lineIterator.hasNext()) {
        // lines are trimmed once here; no further trim() calls are needed below
        String line = lineIterator.next().trim();
        // blank line before any token of the sentence: nothing to close yet
        if (line.isEmpty() && sentenceStart == tokenStart) {
            continue;
        }
        // blank line: end of the current sentence
        if (line.isEmpty()) {
            // drop the trailing space that was appended after the last token
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence text comment line
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            // it is a comment line
            continue;
        }
        // header line declaring a layer and its features
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // data lines must start with a digit (the token number); skip anything else
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }
        // the token number must be in the format NUM-NUM; skip malformed lines instead
        // of failing with an ArrayIndexOutOfBoundsException when no "-" is present
        String[] tokenNumberParts = line.split("-");
        if (tokenNumberParts.length < 2 || !Character.isDigit(tokenNumberParts[1].charAt(0))) {
            continue;
        }
        int count = StringUtils.countMatches(line, "\t");
        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }
        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            // FIX: the end offset must be computed before string concatenation. The
            // previous code evaluated left-to-right and produced keys like "0-03"
            // (begin, begin, length) instead of the intended "begin-end" key "0-3".
            indexedTokens.put(tokenStart + "-" + (tokenStart + tokenColumn.length()), token);
        }
        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations,
                beginEndAnno, tokenAnnotations, relationTargets, lineTk, tokenColumn,
                tokenNumberColumn);
        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn).append(" ");
    }
    // close a final sentence that was not terminated by a blank line
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }
    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}
Example of org.apache.uima.cas.Type usage in the webanno project.
Taken from the WebannoTsv2Writer class, convertToTsv method.
/**
 * Serializes the annotations of the given JCas to the output stream in WebAnno TSV2
 * format: first a header declaring all span and relation layers with their features,
 * then one line per token carrying a column for each declared feature.
 *
 * @param aJCas the CAS to serialize
 * @param aOs the output stream to write to
 * @param aEncoding the character encoding used for writing
 */
private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding) throws IOException, ResourceInitializationException, CASRuntimeException, CASException {
LowLevelCAS llCas = aJCas.getLowLevelCas();
tokenIds = new HashMap<>();
setTokenId(aJCas, tokenIds);
tokenPositions = new TreeMap<>();
setTokenPosition(aJCas, tokenPositions);
Map<Integer, Integer> getTokensPerSentence = new TreeMap<>();
setTokenSentenceAddress(aJCas, getTokensPerSentence);
// list of annotation types
// Tokens/sentences/metadata and coreference are handled separately (or not supported)
Set<Type> allTypes = new LinkedHashSet<>();
for (Annotation a : select(aJCas, Annotation.class)) {
if (!(a instanceof Token || a instanceof Sentence || a instanceof DocumentMetaData || a instanceof TagsetDescription || a instanceof CoreferenceLink)) {
allTypes.add(a.getType());
}
}
Set<Type> relationTypes = new LinkedHashSet<>();
// get all arc types
// A type counts as a relation type if it declares a GOVERNOR feature
for (Type type : allTypes) {
if (type.getFeatures().size() == 0) {
continue;
}
for (Feature feature : type.getFeatures()) {
if (feature.getShortName().equals(GOVERNOR)) {
relationTypes.add(type);
break;
}
}
}
// from here on, allTypes contains only span types
allTypes.removeAll(relationTypes);
// relation annotations
// Map each relation type to the name of the span type its arcs attach to
Map<Type, String> relationTypesMap = new HashMap<>();
for (Type type : relationTypes) {
if (type.getName().equals(Dependency.class.getName())) {
// dependency relations always attach to POS annotations
relationTypesMap.put(type, POS.class.getName());
continue;
}
// otherwise derive the attach type from the governor of an existing annotation
for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), type)) {
for (Feature feature : type.getFeatures()) {
if (feature.getShortName().equals(GOVERNOR)) {
relationTypesMap.put(type, anno.getFeatureValue(feature).getType().getName());
}
}
}
}
// all span annotation first
// Header: " # <type> | <feature> | ..." for every span type
Map<Feature, Type> spanFeatures = new LinkedHashMap<>();
allTypes: for (Type type : allTypes) {
if (type.getFeatures().size() == 0) {
continue;
}
for (Feature feature : type.getFeatures()) {
// coreference annotation not supported
if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
continue allTypes;
}
}
IOUtils.write(" # " + type.getName(), aOs, aEncoding);
for (Feature feature : type.getFeatures()) {
// built-in offset/sofa features are implicit and never written as columns
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
continue;
}
spanFeatures.put(feature, type);
IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
}
}
// write all relation annotation first
// Header continues with relation types; GOVERNOR/DEPENDENT are implicit columns
Set<Feature> relationFeatures = new LinkedHashSet<>();
for (Type type : relationTypes) {
IOUtils.write(" # " + type.getName(), aOs, aEncoding);
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
relationFeatures.add(feature);
IOUtils.write(" | " + feature.getShortName(), aOs, aEncoding);
}
// Add the attach type for the realtion anotation
IOUtils.write(" | AttachTo=" + relationTypesMap.get(type), aOs, aEncoding);
}
IOUtils.write("\n", aOs, aEncoding);
// Collect per-token annotation strings for every span feature (keyed by low-level
// token FS reference)
Map<Feature, Map<Integer, String>> allAnnos = new HashMap<>();
allTypes: for (Type type : allTypes) {
for (Feature feature : type.getFeatures()) {
// coreference annotation not supported
if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
continue allTypes;
}
}
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end")) {
continue;
}
Map<Integer, String> tokenAnnoMap = new TreeMap<>();
setTokenAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
allAnnos.put(feature, tokenAnnoMap);
}
}
// get tokens where dependents are drown to
Map<Feature, Map<Integer, String>> relAnnos = new HashMap<>();
for (Type type : relationTypes) {
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
Map<Integer, String> tokenAnnoMap = new HashMap<>();
setRelationFeatureAnnos(aJCas.getCas(), tokenAnnoMap, type, feature);
relAnnos.put(feature, tokenAnnoMap);
}
}
// get tokens where dependents are drown from - the governor
Map<Type, Map<Integer, String>> governorAnnos = new HashMap<>();
for (Type type : relationTypes) {
Map<Integer, String> govAnnoMap = new HashMap<>();
setRelationGovernorPos(aJCas.getCas(), govAnnoMap, type);
governorAnnos.put(type, govAnnoMap);
}
// Body: "#id=" / "#text=" per sentence, then one tab-separated line per token
int sentId = 1;
for (Sentence sentence : select(aJCas, Sentence.class)) {
IOUtils.write("#id=" + sentId++ + "\n", aOs, aEncoding);
IOUtils.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n", aOs, aEncoding);
for (Token token : selectCovered(Token.class, sentence)) {
IOUtils.write(tokenIds.get(llCas.ll_getFSRef(token)) + "\t" + token.getCoveredText() + "\t", aOs, aEncoding);
// all span annotations on this token
for (Feature feature : spanFeatures.keySet()) {
String annos = allAnnos.get(feature).get(llCas.ll_getFSRef(token));
if (annos == null) {
// "O" marks no-annotation for multi-span layers, "_" for single-span ones
if (multipleSpans.contains(spanFeatures.get(feature).getName())) {
IOUtils.write("O\t", aOs, aEncoding);
} else {
IOUtils.write("_\t", aOs, aEncoding);
}
} else {
IOUtils.write(annos + "\t", aOs, aEncoding);
}
}
// relation feature columns, then the governor-position column per relation type
for (Type type : relationTypes) {
for (Feature feature : type.getFeatures()) {
if (feature.toString().equals("uima.cas.AnnotationBase:sofa") || feature.toString().equals("uima.tcas.Annotation:begin") || feature.toString().equals("uima.tcas.Annotation:end") || feature.getShortName().equals(GOVERNOR) || feature.getShortName().equals(DEPENDENT)) {
continue;
}
String annos = relAnnos.get(feature).get(llCas.ll_getFSRef(token));
if (annos == null) {
IOUtils.write("_\t", aOs, aEncoding);
} else {
IOUtils.write(annos + "\t", aOs, aEncoding);
}
}
// the governor positions
String govPos = governorAnnos.get(type).get(llCas.ll_getFSRef(token));
if (govPos == null) {
IOUtils.write("_\t", aOs, aEncoding);
} else {
IOUtils.write(governorAnnos.get(type).get(llCas.ll_getFSRef(token)) + "\t", aOs, aEncoding);
}
}
IOUtils.write("\n", aOs, aEncoding);
}
IOUtils.write("\n", aOs, aEncoding);
}
}
Example of org.apache.uima.cas.Type usage in the webanno project.
Taken from the ConstraintsGeneratorTest class, testTwoConditions method.
@Test
public void testTwoConditions() throws Exception {
    JCas jcas = makeJCasOneSentence();
    CAS cas = jcas.getCas();

    // Use the first and the last token of the sentence as anchors for the two entities.
    List<Token> allTokens = new ArrayList<>(select(jcas, Token.class));
    Token firstToken = allTokens.get(0);
    Token lastToken = allTokens.get(allTokens.size() - 1);

    NamedEntity governor = new NamedEntity(jcas, firstToken.getBegin(), firstToken.getEnd());
    governor.setValue("Animal");
    governor.addToIndexes();

    NamedEntity dependent = new NamedEntity(jcas, lastToken.getBegin(), lastToken.getEnd());
    dependent.setValue("NotWeight");
    dependent.addToIndexes();

    // Link the two entities with a custom relation annotation.
    Type relationType = cas.getTypeSystem().getType("webanno.custom.Relation");
    AnnotationFS relation = cas.createAnnotation(relationType, dependent.getBegin(),
            dependent.getEnd());
    FSUtil.setFeature(relation, "Governor", governor);
    FSUtil.setFeature(relation, "Dependent", dependent);
    cas.addFsToIndexes(relation);

    // Parse the rules and evaluate which label values the constraints would allow.
    ConstraintsGrammar parser = new ConstraintsGrammar(
            new FileInputStream("src/test/resources/rules/twoConditions.rules"));
    ParsedConstraints constraints = parser.Parse().accept(new ParserVisitor());

    Evaluator constraintsEvaluator = new ValuesGenerator();
    List<PossibleValue> possibleValues = constraintsEvaluator.generatePossibleValues(relation,
            "label", constraints);
    System.out.println(possibleValues);

    // "Weight" != "NotWeight", so the rule should not match
    assertEquals(0, possibleValues.size());
}
Example of org.apache.uima.cas.Type usage in the webanno project.
Taken from the DiffUtils class, makeLinkFS method.
/**
 * Creates and indexes a link feature structure of type {@code LINK_TYPE} whose "role"
 * feature carries the given slot label and whose "target" feature points at a freshly
 * created (and indexed) Token covering the given span.
 *
 * @param aJCas the CAS to create the structures in
 * @param aSlotLabel the value stored in the link's "role" feature
 * @param aTargetBegin begin offset of the target token
 * @param aTargetEnd end offset of the target token
 * @return the indexed link feature structure
 */
public static FeatureStructure makeLinkFS(JCas aJCas, String aSlotLabel, int aTargetBegin, int aTargetEnd) {
    // Create the token the link will point at.
    Token target = new Token(aJCas, aTargetBegin, aTargetEnd);
    target.addToIndexes();

    // Build the link FS and fill its role/target features.
    Type linkType = aJCas.getTypeSystem().getType(LINK_TYPE);
    FeatureStructure link = aJCas.getCas().createFS(linkType);
    link.setStringValue(linkType.getFeatureByBaseName("role"), aSlotLabel);
    link.setFeatureValue(linkType.getFeatureByBaseName("target"), target);

    aJCas.getCas().addFsToIndexes(link);
    return link;
}
Aggregations