Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.
The class Conll2009Reader, method convert().
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences, e.g. when multiple end-of-sentence markers follow
            // each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // Tokens, Lemma, POS
        Map<Integer, Token> tokens = new HashMap<Integer, Token>();
        List<SemPred> preds = new ArrayList<>();
        Iterator<String[]> wordIterator = words.iterator();
        while (wordIterator.hasNext()) {
            String[] word = wordIterator.next();
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(Integer.valueOf(word[ID]), token);
            if (wordIterator.hasNext()) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS].intern());
                // WebAnno did not yet backport the coarse-grained POS feature from
                // DKPro Core 1.9.0
                // POSUtils.assignCoarseValue(pos);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEAT]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEAT]);
                morphtag.addToIndexes();
            }
            // Read semantic predicate
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);
                    // Model the root as a loop onto itself
                    if (govId == 0) {
                        // Not using ROOT here because WebAnno cannot deal with elevated
                        // types
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(depId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        // This is set via FSUtil because we still use the DKPro Core 1.7.0
                        // JCas classes
                        FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC);
                        rel.addToIndexes();
                    }
                    else {
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(govId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        // This is set via FSUtil because we still use the DKPro Core 1.7.0
                        // JCas classes
                        FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC);
                        rel.addToIndexes();
                    }
                }
            }
        }
        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time
            for (int p = 0; p < preds.size(); p++) {
                List<SemArgLink> args = new ArrayList<>();
                for (String[] word : words) {
                    if (!UNUSED.equals(word[APRED + p])) {
                        Token token = tokens.get(Integer.valueOf(word[ID]));
                        SemArg arg = new SemArg(aJCas, token.getBegin(), token.getEnd());
                        arg.addToIndexes();
                        SemArgLink link = new SemArgLink(aJCas);
                        link.setRole(word[APRED + p]);
                        link.setTarget(arg);
                        args.add(link);
                    }
                }
                SemPred pred = preds.get(p);
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
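The readSentence() helper used by both readers is not shown on this page. Assuming the usual CoNLL conventions (one tab-separated token row per line, sentences separated by blank lines), a minimal sketch compatible with the calling loop above could look like this; the method name matches the call site, but the body is an assumption, not the project's actual implementation:

private List<String[]> readSentence(BufferedReader aReader) throws IOException {
    List<String[]> words = new ArrayList<>();
    String line;
    while ((line = aReader.readLine()) != null) {
        if (line.trim().isEmpty()) {
            // A blank line ends the current sentence; this may yield an empty list,
            // which the caller skips.
            return words;
        }
        words.add(line.split("\t"));
    }
    // End of input: return the final sentence, or null so the caller's loop terminates.
    return words.isEmpty() ? null : words;
}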
Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.
The class ConllUReader, method convert().
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences, e.g. when multiple end-of-sentence markers follow
            // each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        int surfaceBegin = -1;
        int surfaceEnd = -1;
        String surfaceString = null;
        // Tokens, Lemma, POS
        Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
        for (String[] word : words) {
            // A multi-word token line such as "1-2" only carries the surface form; the
            // actual tokens follow on the next lines.
            if (word[ID].contains("-")) {
                String[] fragments = word[ID].split("-");
                surfaceBegin = Integer.valueOf(fragments[0]);
                surfaceEnd = Integer.valueOf(fragments[1]);
                surfaceString = word[FORM];
                continue;
            }
            // Read token
            int tokenIdx = Integer.valueOf(word[ID]);
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(tokenIdx, token);
            if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                doc.add(" ");
            }
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POSTAG]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POSTAG]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEATS]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEATS]);
                morphtag.addToIndexes();
                token.setMorph(morphtag);
                // Try parsing out individual feature values. Since the DKPro Core
                // MorphologicalFeatures type is based on the definition from the UD
                // project, we can do this rather straightforwardly.
                Type morphType = morphtag.getType();
                String[] items = word[FEATS].split("\\|");
                for (String item : items) {
                    String[] keyValue = item.split("=");
                    StringBuilder key = new StringBuilder(keyValue[0]);
                    key.setCharAt(0, Character.toLowerCase(key.charAt(0)));
                    String value = keyValue[1];
                    Feature feat = morphType.getFeatureByBaseName(key.toString());
                    if (feat != null) {
                        morphtag.setStringValue(feat, value);
                    }
                }
            }
            // Read surface form
            if (tokenIdx == surfaceEnd) {
                int begin = tokens.get(surfaceBegin).getBegin();
                int end = tokens.get(surfaceEnd).getEnd();
                SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                surfaceForm.setValue(surfaceString);
                surfaceForm.addToIndexes();
                surfaceBegin = -1;
                surfaceEnd = -1;
                surfaceString = null;
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);
                    // Model the root as a loop onto itself
                    makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC,
                            tokens, word);
                }
                if (!UNUSED.equals(word[DEPS])) {
                    // List items separated by vertical bar
                    String[] items = word[DEPS].split("\\|");
                    for (String item : items) {
                        String[] sItem = item.split(":");
                        int depId = Integer.valueOf(word[ID]);
                        int govId = Integer.valueOf(sItem[0]);
                        makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED,
                                tokens, word);
                    }
                }
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // One sentence per line.
        doc.add("\n");
    }
    doc.close();
}
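The makeDependency() helper called here is also not shown. Based on the inline dependency-building code in the Conll2009Reader snippet above, a plausible sketch is the following; parameter names and the return type are assumptions:

private Dependency makeDependency(JCas aJCas, int aGovId, int aDepId, String aLabel,
        String aFlavor, Int2ObjectMap<Token> aTokens, String[] aWord)
{
    Dependency rel = new Dependency(aJCas);
    // Model the root (head id 0) as a loop of the dependent onto itself.
    rel.setGovernor(aTokens.get(aGovId == 0 ? aDepId : aGovId));
    rel.setDependent(aTokens.get(aDepId));
    rel.setDependencyType(aLabel);
    rel.setBegin(rel.getDependent().getBegin());
    rel.setEnd(rel.getDependent().getEnd());
    // Set via FSUtil because the DKPro Core 1.7.0 JCas classes do not expose the
    // "flavor" feature directly.
    FSUtil.setFeature(rel, "flavor", aFlavor);
    rel.addToIndexes();
    return rel;
}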
Use of org.apache.uima.fit.factory.JCasBuilder in project dkpro-tc by dkpro.
The class NGramUtilsTest, method phoneticNgramsTest().
@Test
public void phoneticNgramsTest() throws Exception {
    String text = "This is a big house";
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText(text);
    TextClassificationTarget aTarget = new TextClassificationTarget(jcas, 0, text.length());
    aTarget.addToIndexes();
    JCasBuilder cb = new JCasBuilder(jcas);
    for (String token : text.split(" ")) {
        cb.add(token, Token.class);
    }
    // Add a Sentence annotation spanning from offset 0 to the builder's current position.
    cb.add(0, Sentence.class);
    FrequencyDistribution<String> ngrams = PhoneticNGramMC.getDocumentPhoneticNgrams(jcas,
            aTarget, 1, 3);
    assertEquals(12, ngrams.getN());
    assertTrue(ngrams.contains("I000"));
    assertTrue(ngrams.contains("T200"));
}
Use of org.apache.uima.fit.factory.JCasBuilder in project dkpro-tc by dkpro.
The class NGramUtilsTest, method characterBiGrams().
@Test
public void characterBiGrams() throws Exception {
    String text = "A house";
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentLanguage("en");
    jcas.setDocumentText(text);
    JCasBuilder cb = new JCasBuilder(jcas);
    for (String token : text.split(" ")) {
        cb.add(token, Token.class);
    }
    TextClassificationTarget tu = new TextClassificationTarget(jcas, 2, 7);
    tu.addToIndexes();
    FrequencyDistribution<String> ngrams = CharacterNGramMC.getAnnotationCharacterNgrams(tu,
            false, 2, 3, '^', '$');
    for (String s : ngrams.getKeys()) {
        System.out.println(s);
    }
    assertEquals(11, ngrams.getN());
    assertTrue(ngrams.contains("^h"));
    assertTrue(ngrams.contains("ho"));
    assertTrue(ngrams.contains("ou"));
    assertTrue(ngrams.contains("us"));
    assertTrue(ngrams.contains("se"));
    assertTrue(ngrams.contains("e$"));
    assertTrue(ngrams.contains("^ho"));
    assertTrue(ngrams.contains("hou"));
    assertTrue(ngrams.contains("ous"));
    assertTrue(ngrams.contains("use"));
    assertTrue(ngrams.contains("se$"));
}
Use of org.apache.uima.fit.factory.JCasBuilder in project webanno by webanno.
The class BratAjaxCasUtilTest, method testIsSameSentence().
@Test
public void testIsSameSentence() throws Exception {
    JCas jcas = JCasFactory.createJCas();
    JCasBuilder jb = new JCasBuilder(jcas);
    Sentence s1 = jb.add("Sentence 1.", Sentence.class);
    jb.add(" ");
    Sentence s2 = jb.add("Sentence 2.", Sentence.class);
    jb.close();
    assertTrue(isSameSentence(jcas, s2.getBegin(), s2.getEnd()));
    assertTrue(isSameSentence(jcas, s2.getEnd(), s2.getBegin()));
    assertTrue(isSameSentence(jcas, s1.getBegin() + 1, s1.getEnd() - 1));
    assertTrue(isSameSentence(jcas, s1.getEnd() - 1, s1.getBegin() + 1));
    assertTrue(isSameSentence(jcas, s1.getBegin(), s1.getEnd()));
    assertTrue(isSameSentence(jcas, s1.getEnd(), s1.getBegin()));
    assertFalse(isSameSentence(jcas, s2.getBegin(), s1.getBegin()));
    assertFalse(isSameSentence(jcas, s1.getBegin(), s2.getBegin()));
    assertTrue(isSameSentence(jcas, 0, 0));
}
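All of these examples follow the same JCasBuilder pattern: append text piece by piece, create annotations over the spans as they are appended, and close the builder to commit the accumulated text as the document text. A minimal, self-contained sketch of that pattern, using the DKPro Core Token and Sentence types from the snippets above:

JCas jcas = JCasFactory.createJCas();
JCasBuilder builder = new JCasBuilder(jcas);
for (String word : new String[] { "Hello", "world", "." }) {
    // add(String, Class) appends the text and returns an annotation covering it.
    builder.add(word, Token.class);
    builder.add(" ");
}
// add(int, Class) creates an annotation from the given offset up to the current position.
builder.add(0, Sentence.class);
// close() sets the accumulated string as the document text of the JCas.
builder.close();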