use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
the class WebannoTsv3Writer method setTokenSentenceAddress.
/**
 * Creates one {@link AnnotationUnit} per token and records its address.
 *
 * <p>Sentences and the tokens they cover are numbered from 1. For every token a unit is built
 * from the token's begin/end offsets and covered text, appended to {@code units}, and mapped in
 * {@code unitsLineNumber} to the string {@code "<sentence>-<token>"}. The unit of the first
 * token of each sentence is additionally mapped in {@code sentenceUnits} to the covered text of
 * the whole sentence.
 *
 * @param aJCas the CAS whose sentences and tokens are visited
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceIndex = 0;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        sentenceIndex++;
        int tokenIndex = 0;
        for (Token token : selectCovered(Token.class, sentence)) {
            tokenIndex++;
            final AnnotationUnit tokenUnit = new AnnotationUnit(token.getBegin(), token.getEnd(), false,
                    token.getCoveredText());
            units.add(tokenUnit);
            // Only the first token of a sentence carries the sentence text.
            if (tokenIndex == 1) {
                sentenceUnits.put(tokenUnit, sentence.getCoveredText());
            }
            final String address = sentenceIndex + "-" + tokenIndex;
            unitsLineNumber.put(tokenUnit, address);
        }
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
the class ConllUWriter method convert.
/**
 * Writes every sentence of the CAS to {@code aOut} as tab-separated token rows, followed by an
 * empty line per sentence.
 *
 * <p>Each token row has ten columns: id, covered text, lemma, coarse POS, POS, morphological
 * features, head id, dependency type, enhanced dependencies and a misc column. Columns for
 * which no value is available (or whose {@code write*} switch is off) hold {@code UNUSED}.
 * When a surface form begins at a token's begin offset, an additional range row
 * ({@code firstId-lastId}) carrying the surface form value is written before that token's row.
 *
 * @param aJCas the CAS to read sentences, tokens, dependencies and surface forms from
 * @param aOut the writer receiving the formatted rows
 */
private void convert(JCas aJCas, PrintWriter aOut) {
// Tokens covered by each surface form, used below to compute the id range of a surface form.
Map<SurfaceForm, Collection<Token>> surfaceIdx = indexCovered(aJCas, SurfaceForm.class, Token.class);
// Surface forms keyed by begin offset; a later surface form with the same begin replaces an earlier one.
Int2ObjectMap<SurfaceForm> surfaceBeginIdx = new Int2ObjectOpenHashMap<>();
for (SurfaceForm sf : select(aJCas, SurfaceForm.class)) {
surfaceBeginIdx.put(sf.getBegin(), sf);
}
for (Sentence sentence : select(aJCas, Sentence.class)) {
// Insertion-ordered so that rows are later written in token order.
HashMap<Token, Row> ctokens = new LinkedHashMap<>();
// Tokens: one row per token, ids are 1-based within the sentence.
List<Token> tokens = selectCovered(Token.class, sentence);
for (int i = 0; i < tokens.size(); i++) {
Row row = new Row();
row.id = i + 1;
row.token = tokens.get(i);
// True only when a next token exists in this sentence and starts exactly where this one ends.
row.noSpaceAfter = (i + 1 < tokens.size()) && row.token.getEnd() == tokens.get(i + 1).getBegin();
ctokens.put(row.token, row);
}
// Dependencies: attach each relation to the row of its dependent token.
// A blank or BASIC flavor becomes the row's primary relation, anything else is collected in deps.
// NOTE(review): ctokens.get(...) yields null if the dependent is not a token of this sentence,
// which would throw a NullPointerException here - confirm that cannot occur.
for (Dependency rel : selectCovered(Dependency.class, sentence)) {
String flavor = FSUtil.getFeature(rel, "flavor", String.class);
if (StringUtils.isBlank(flavor) || DependencyFlavor.BASIC.equals(flavor)) {
ctokens.get(rel.getDependent()).deprel = rel;
} else {
ctokens.get(rel.getDependent()).deps.add(rel);
}
}
// Write sentence in CONLL-U format
for (Row row : ctokens.values()) {
String lemma = UNUSED;
if (writeLemma && (row.token.getLemma() != null)) {
lemma = row.token.getLemma().getValue();
}
String pos = UNUSED;
String cpos = UNUSED;
if (writePos && (row.token.getPos() != null)) {
POS posAnno = row.token.getPos();
pos = posAnno.getPosValue();
// Coarse tag is looked up by the POS annotation class; fall back to the fine tag if the lookup is blank.
cpos = dkpro2ud.get(posAnno.getClass());
if (StringUtils.isBlank(cpos)) {
cpos = pos;
}
}
int headId = UNUSED_INT;
String deprel = UNUSED;
String deps = UNUSED;
if (writeDependency) {
if ((row.deprel != null)) {
deprel = row.deprel.getDependencyType();
// NOTE(review): same null risk as above if the governor is not a token of this sentence.
headId = ctokens.get(row.deprel.getGovernor()).id;
if (headId == row.id) {
// ROOT dependencies may be modeled as a loop, ignore these.
headId = 0;
}
}
// Non-primary relations are rendered as "head:type" entries joined by '|'.
StringBuilder depsBuf = new StringBuilder();
for (Dependency d : row.deps) {
if (depsBuf.length() > 0) {
depsBuf.append('|');
}
// Resolve self-looping root to 0-indexed root
int govId = ctokens.get(d.getGovernor()).id;
if (govId == row.id) {
govId = 0;
}
depsBuf.append(govId);
depsBuf.append(':');
depsBuf.append(d.getDependencyType());
}
if (depsBuf.length() > 0) {
deps = depsBuf.toString();
}
}
// The head column stays UNUSED unless a head id was actually assigned above.
String head = UNUSED;
if (headId != UNUSED_INT) {
head = Integer.toString(headId);
}
String feats = UNUSED;
if (writeMorph && (row.token.getMorph() != null)) {
feats = row.token.getMorph().getValue();
}
String misc = UNUSED;
if (row.noSpaceAfter) {
misc = "SpaceAfter=No";
}
// If a surface form starts at this token, emit its range row before the token row.
SurfaceForm sf = surfaceBeginIdx.get(row.token.getBegin());
if (sf != null) {
// NOTE(review): assumes the indexed collection is a non-empty List whose tokens all belong
// to this sentence; otherwise the cast, get(0) or the row lookup would fail - confirm.
@SuppressWarnings({ "unchecked", "rawtypes" }) List<Token> covered = (List) surfaceIdx.get(sf);
int id1 = ctokens.get(covered.get(0)).id;
int id2 = ctokens.get(covered.get(covered.size() - 1)).id;
aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id1, id2, sf.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED);
}
aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, row.token.getCoveredText(), lemma, cpos, pos, feats, head, deprel, deps, misc);
}
// Blank line terminates the sentence.
aOut.println();
}
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
the class WebAnnoSemanticGraphReader method convertToCas.
/**
 * Reads entailment pairs, one per input line, and adds them to the CAS.
 *
 * <p>Each line is split at the first-level separators {@code "\t>\t"} or {@code "\tX\t"} into a
 * text part and a hypothesis part. Both parts are whitespace-tokenized; every token becomes a
 * {@link Token}, and tokens are joined by single spaces in the document text. Between the two
 * parts a one-character marker token ({@code >} or {@code X}) is inserted. The text part and the
 * hypothesis part are each covered by a {@link CoreferenceLink} (reference types {@code "text"}
 * and {@code "hypothesis"}) joined in one {@link CoreferenceChain}; the first link's reference
 * relation is {@code "entails"} when the line contains {@code "\t>\t"} and
 * {@code "do not entails"} otherwise. Each line is finally wrapped in a {@link Sentence} and
 * terminated by a newline in the document text.
 *
 * <p>The hypothesis tokens are taken from the second part of the split. Lines for which the
 * split yields no second part keep the previous behaviour of repeating the first part.
 *
 * @param aJCas the CAS receiving the annotations and the document text
 * @param aIs the stream to read the lines from
 * @param aEncoding the character encoding of {@code aIs}
 * @throws IOException declared for callers; raised if the line iterator cannot be created
 */
public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {
    StringBuilder text = new StringBuilder();
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int tokenBeginPosition = 0;
    while (lineIterator.hasNext()) {
        String line = lineIterator.next();
        String[] contents = line.split("\t>\t|\tX\t");
        String premise = contents[0];
        // Previously the first part was tokenized twice and the hypothesis was never read.
        // Fall back to the first part only when the line carries no second part.
        String hypothesis = contents.length > 1 ? contents[1] : contents[0];

        int sentenceBegin = tokenBeginPosition;
        int chainBegin = tokenBeginPosition;
        int chainEnd = 0;

        // Text part: one token per whitespace-separated word, separated by one space each.
        StringTokenizer st = new StringTokenizer(premise);
        while (st.hasMoreTokens()) {
            String content = st.nextToken();
            Token outToken = new Token(aJCas, tokenBeginPosition, tokenBeginPosition + content.length());
            outToken.addToIndexes();
            tokenBeginPosition = outToken.getEnd() + 1;
            chainEnd = tokenBeginPosition;
            text.append(content).append(" ");
        }

        // chainEnd points one past the trailing space, hence the -1.
        CoreferenceChain chain = new CoreferenceChain(aJCas);
        CoreferenceLink link = new CoreferenceLink(aJCas, chainBegin, chainEnd - 1);
        link.setReferenceType("text");
        link.addToIndexes();
        chain.setFirst(link);

        // Marker token between the two parts: ">" for entailment, "X" otherwise.
        boolean entails = line.contains("\t>\t");
        link.setReferenceRelation(entails ? "entails" : "do not entails");
        Token markerToken = new Token(aJCas, tokenBeginPosition, tokenBeginPosition + 1);
        markerToken.addToIndexes();
        tokenBeginPosition = markerToken.getEnd() + 1;
        text.append(entails ? "> " : "X ");

        // Hypothesis part, tokenized the same way as the text part.
        chainBegin = tokenBeginPosition;
        st = new StringTokenizer(hypothesis);
        while (st.hasMoreTokens()) {
            String content = st.nextToken();
            Token outToken = new Token(aJCas, tokenBeginPosition, tokenBeginPosition + content.length());
            outToken.addToIndexes();
            tokenBeginPosition = outToken.getEnd() + 1;
            chainEnd = tokenBeginPosition;
            text.append(content).append(" ");
        }

        CoreferenceLink nextLink = new CoreferenceLink(aJCas, chainBegin, chainEnd - 1);
        nextLink.setReferenceType("hypothesis");
        nextLink.addToIndexes();
        link.setNext(nextLink);
        chain.addToIndexes();

        text.append("\n");
        Sentence outSentence = new Sentence(aJCas);
        outSentence.setBegin(sentenceBegin);
        outSentence.setEnd(tokenBeginPosition);
        outSentence.addToIndexes();
        // Account for the newline appended above.
        tokenBeginPosition = tokenBeginPosition + 1;
    }
    aJCas.setDocumentText(text.toString());
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
the class LineOrientedTextReader method createSentence.
/**
 * Creates and indexes a {@link Sentence} for the given span of the document text.
 *
 * <p>The span is first passed, together with the document text, to {@code trim}, which may
 * adjust the offsets in place (presumably stripping surrounding whitespace - see that helper).
 * If {@code isEmpty} reports the adjusted span as empty, no annotation is created.
 *
 * @param aJCas the CAS to add the sentence to
 * @param aBegin begin offset of the candidate span
 * @param aEnd end offset of the candidate span
 * @return the sentence that was added to the indexes, or {@code null} if the span was empty
 */
protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd) {
    final int[] bounds = { aBegin, aEnd };
    trim(aJCas.getDocumentText(), bounds);
    if (isEmpty(bounds[0], bounds[1])) {
        return null;
    }
    final Sentence sentence = new Sentence(aJCas, bounds[0], bounds[1]);
    sentence.addToIndexes(aJCas);
    return sentence;
}
use of de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence in project webanno by webanno.
the class WebannoTsv1Reader method createSentence.
/**
 * Adds the sentence layer to the CAS: one {@link Sentence} per entry of
 * {@code firstTokenInSentence}.
 *
 * <p>Tokens are looked up in {@code tokensStored} under the key {@code "t_" + number}. For the
 * entry at position {@code i}:
 * <ul>
 * <li>begin is the begin of the entry's token for the first entry, the end of the entry's token
 * for the last entry (when it is not also the first), and the end of the entry's token plus one
 * for all entries in between;</li>
 * <li>end is the end of the token numbered {@code tokensStored.size()} for the last entry, and
 * the end of the next entry's token otherwise.</li>
 * </ul>
 * NOTE(review): the begin offsets of middle and last sentences differ by one; whether that is
 * intended depends on how the caller fills {@code firstTokenInSentence} - confirm.
 *
 * @param aJCas the CAS to add the sentences to
 * @param firstTokenInSentence token numbers marking the sentence boundaries
 * @param tokensStored tokens keyed by {@code "t_" + number}
 */
private void createSentence(JCas aJCas, List<Integer> firstTokenInSentence, Map<String, Token> tokensStored) {
    final int lastIndex = firstTokenInSentence.size() - 1;
    for (int idx = 0; idx <= lastIndex; idx++) {
        final boolean isFirst = idx == 0;
        final boolean isLast = idx == lastIndex;

        Sentence sentence = new Sentence(aJCas);

        Token boundaryToken = tokensStored.get("t_" + firstTokenInSentence.get(idx));
        if (isFirst) {
            sentence.setBegin(boundaryToken.getBegin());
        } else if (isLast) {
            sentence.setBegin(boundaryToken.getEnd());
        } else {
            sentence.setBegin(boundaryToken.getEnd() + 1);
        }

        // The last sentence runs to the highest-numbered stored token, any other one to the next boundary token.
        Token closingToken;
        if (isLast) {
            closingToken = tokensStored.get("t_" + (tokensStored.size()));
        } else {
            closingToken = tokensStored.get("t_" + firstTokenInSentence.get(idx + 1));
        }
        sentence.setEnd(closingToken.getEnd());
        sentence.addToIndexes();
    }
}
Aggregations