Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm in project webanno by webanno.
The class ConllUReader, method convert.
/**
 * Converts CoNLL-U data from the given reader into annotations in the given CAS.
 *
 * <p>Sentences are obtained one at a time from {@code readSentence} until it returns
 * {@code null}; empty sentences are skipped. For every regular token row a {@link Token} is added
 * to the document text, followed by a single space unless the MISC column contains
 * {@code SpaceAfter=No}. Depending on the {@code readLemma}, {@code readPos}, {@code readMorph}
 * and {@code readDependency} flags, lemma, part-of-speech, morphological features and
 * dependency annotations are created as well. Rows whose ID is a range (e.g. {@code 1-2}) are
 * turned into a {@link SurfaceForm} spanning the tokens of that range. Each sentence is
 * terminated by a line break in the document text.
 *
 * @param aJCas the CAS to which text and annotations are added.
 * @param aReader the reader providing the CoNLL-U data.
 * @throws IOException if the POS mapping provider cannot be configured, or if reading fails.
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }

    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Sentence markers following each other.
            continue;
        }

        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;

        // State of the multiword token (ID range row) currently being processed, if any.
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;

        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            if (word[ID].contains("-")) {
                // Range row: remember the range and surface text; the annotation is created once
                // the last token of the range has been read.
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.parseInt(fragments[0]);
                surfaceEnd = Integer.parseInt(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }

            // Read token
            int tokenIdx = Integer.parseInt(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }

            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }

            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }

            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);

                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD project,
                // we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                for (String item : word[FEATS].split("\\|")) {
                    int separator = item.indexOf('=');
                    if (separator <= 0 || separator == item.length() - 1) {
                        // Not a well-formed "Key=Value" pair (missing key, '=' or value). The
                        // raw FEATS string is already stored above, so only the typed feature
                        // is skipped instead of failing the whole document.
                        continue;
                    }
                    // Feature base names start with a lower-case letter, UD keys with upper-case.
                    String key = Character.toLowerCase(item.charAt(0))
                            + item.substring(1, separator);
                    String value = item.substring(separator + 1);
                    Feature feat = morphType.getFeatureByBaseName(key);
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }

            // Read surface form
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }

            sentenceEnd = token.getEnd();
        }

        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                // NOTE: the ID is parsed only inside the guards below because range rows
                // (e.g. "1-2") carry no DEPREL/DEPS and their ID is not a plain integer.
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.parseInt(word[ID]);
                    int govId = Integer.parseInt(word[HEAD]);
                    // Model the root as a loop onto itself
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC,
                            tokens, word);
                }

                if (!UNUSED.equals(word[DEPS])) {
                    // list items separated by vertical bar
                    for (String item : word[DEPS].split("\\|")) {
                        // Split only at the first colon: the relation label may itself contain
                        // colons (e.g. "4:nmod:poss") and must be kept intact.
                        String[] sItem = item.split(":", 2);
                        int depId = Integer.parseInt(word[ID]);
                        int govId = Integer.parseInt(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED,
                                tokens, word);
                    }
                }
            }
        }

        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();

        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
Use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm in project webanno by webanno.
The class ConllUWriter, method convert.
/**
 * Writes the sentences of the given CAS to the given writer in CoNLL-U format.
 *
 * <p>Tokens of each sentence are numbered starting at 1. Lemma, part-of-speech, morphological
 * features and dependency columns are filled only if the corresponding {@code write*} flag is
 * set and the annotation is present; otherwise the {@code UNUSED} placeholder is written. A
 * range row is emitted before the token at whose begin offset a {@link SurfaceForm} starts.
 * Every sentence is followed by an empty line.
 *
 * @param aJCas the CAS to read annotations from.
 * @param aOut the writer receiving the CoNLL-U output.
 */
private void convert(JCas aJCas, PrintWriter aOut) {
    // Tokens covered by each surface form, plus a lookup of surface forms by begin offset.
    Map<SurfaceForm, Collection<Token>> tokensBySurfaceForm = indexCovered(aJCas,
            SurfaceForm.class, Token.class);
    Int2ObjectMap<SurfaceForm> surfaceFormByBegin = new Int2ObjectOpenHashMap<>();
    for (SurfaceForm surfaceForm : select(aJCas, SurfaceForm.class)) {
        surfaceFormByBegin.put(surfaceForm.getBegin(), surfaceForm);
    }

    for (Sentence sentence : select(aJCas, Sentence.class)) {
        // Insertion-ordered so rows are written in token order.
        Map<Token, Row> rowsByToken = new LinkedHashMap<>();

        // Tokens
        List<Token> sentenceTokens = selectCovered(Token.class, sentence);
        int tokenCount = sentenceTokens.size();
        for (int idx = 0; idx < tokenCount; idx++) {
            Token current = sentenceTokens.get(idx);
            boolean hasSuccessor = idx + 1 < tokenCount;

            Row entry = new Row();
            entry.id = idx + 1;
            entry.token = current;
            entry.noSpaceAfter = hasSuccessor
                    && current.getEnd() == sentenceTokens.get(idx + 1).getBegin();
            rowsByToken.put(current, entry);
        }

        // Dependencies: blank or BASIC flavor goes to the main relation, anything else to DEPS.
        for (Dependency relation : selectCovered(Dependency.class, sentence)) {
            String flavor = FSUtil.getFeature(relation, "flavor", String.class);
            boolean isBasic = StringUtils.isBlank(flavor)
                    || DependencyFlavor.BASIC.equals(flavor);
            Row dependentRow = rowsByToken.get(relation.getDependent());
            if (isBasic) {
                dependentRow.deprel = relation;
            } else {
                dependentRow.deps.add(relation);
            }
        }

        // Write sentence in CONLL-U format
        for (Row row : rowsByToken.values()) {
            Token token = row.token;

            String misc = row.noSpaceAfter ? "SpaceAfter=No" : UNUSED;

            String feats = UNUSED;
            if (writeMorph && token.getMorph() != null) {
                feats = token.getMorph().getValue();
            }

            String lemma = UNUSED;
            if (writeLemma && token.getLemma() != null) {
                lemma = token.getLemma().getValue();
            }

            String pos = UNUSED;
            String cpos = UNUSED;
            POS posAnnotation = writePos ? token.getPos() : null;
            if (posAnnotation != null) {
                pos = posAnnotation.getPosValue();
                cpos = dkpro2ud.get(posAnnotation.getClass());
                if (StringUtils.isBlank(cpos)) {
                    // No mapping to a UD tag: fall back to the original tag.
                    cpos = pos;
                }
            }

            int headId = UNUSED_INT;
            String deprel = UNUSED;
            String deps = UNUSED;
            if (writeDependency) {
                Dependency basic = row.deprel;
                if (basic != null) {
                    deprel = basic.getDependencyType();
                    headId = rowsByToken.get(basic.getGovernor()).id;
                    if (headId == row.id) {
                        // ROOT dependencies may be modeled as a loop, ignore these.
                        headId = 0;
                    }
                }

                StringBuilder enhanced = new StringBuilder();
                for (Dependency extra : row.deps) {
                    if (enhanced.length() > 0) {
                        enhanced.append('|');
                    }
                    // Resolve self-looping root to 0-indexed root
                    int governorId = rowsByToken.get(extra.getGovernor()).id;
                    if (governorId == row.id) {
                        governorId = 0;
                    }
                    enhanced.append(governorId).append(':').append(extra.getDependencyType());
                }
                if (enhanced.length() > 0) {
                    deps = enhanced.toString();
                }
            }
            String head = (headId == UNUSED_INT) ? UNUSED : Integer.toString(headId);

            // A surface form starting at this token yields a range row ahead of the token row.
            SurfaceForm surfaceForm = surfaceFormByBegin.get(token.getBegin());
            if (surfaceForm != null) {
                @SuppressWarnings({ "unchecked", "rawtypes" })
                List<Token> covered = (List) tokensBySurfaceForm.get(surfaceForm);
                int firstId = rowsByToken.get(covered.get(0)).id;
                int lastId = rowsByToken.get(covered.get(covered.size() - 1)).id;
                aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", firstId, lastId,
                        surfaceForm.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED,
                        UNUSED, UNUSED);
            }

            aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id,
                    token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, deps, misc);
        }

        aOut.println();
    }
}
Aggregations