use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.
the class TreeParser method getTreebankTrees.
/**
 * Parses every sentence of the given text into a treebank tree.
 * <p>
 * The text is first run through the tokenizer engine so that sentence annotations
 * are available. Each sentence is then tokenized and parsed in a CAS of its own
 * (created via {@code tokenizer.newCAS()}), and the single {@link TopTreebankNode}
 * found in that CAS is collected.
 * <p>
 * NOTE(review): the per-sentence CAS instances are not reset or recycled here,
 * presumably because the returned nodes remain backed by them - confirm before
 * changing that.
 *
 * @param text the text to process
 * @return one top-level treebank node per sentence found in {@code text};
 *         an empty list if {@code text} is empty
 * @throws Exception if tokenizing or parsing fails, or if the top-level node of a
 *         sentence cannot be selected
 */
public List<TreebankNode> getTreebankTrees(String text) throws Exception {
    if (text.isEmpty())
        return new ArrayList<>();

    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        List<TreebankNode> ret = new ArrayList<>();
        for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
            // A dedicated CAS per sentence: the node added to the result comes from it.
            CAS c2 = tokenizer.newCAS();
            c2.setDocumentText(sentence.getCoveredText());
            tokenizer.process(c2);
            parser.process(c2);

            //build the tree based on this
            TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
            ret.add(node);
        }
        return ret;
    } finally {
        // Always hand the pooled CAS back, even when processing fails half-way.
        pool.releaseCas(c);
    }
}
use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.
the class UimaSentenceIterator method nextSentence.
/**
 * Returns the next sentence, refilling the sentence buffer from the reader when
 * the current one is exhausted.
 * <p>
 * When buffered sentences remain, the next one is returned directly. Otherwise the
 * next document is read into a CAS, run through the analysis engine, and its
 * sentences are buffered; documents that yield no sentences are skipped. If a
 * pre-processor is configured, it is applied to the sentence before it is returned.
 *
 * @return the next (optionally pre-processed) sentence; {@code null} when the reader
 *         has no further documents; an empty string when reading the next document
 *         fails
 * @throws RuntimeException wrapping any exception raised while refilling the buffer
 */
@Override
public synchronized String nextSentence() {
    // Fast path: the current document still has sentences buffered.
    if (sentences != null && sentences.hasNext()) {
        String buffered = sentences.next();
        return this.getPreProcessor() == null ? buffered : this.getPreProcessor().preProcess(buffered);
    }

    try {
        if (!getReader().hasNext()) {
            return null;
        }

        CAS cas = resource.retrieve();
        try {
            getReader().getNext(cas);
        } catch (Exception e) {
            log.warn("Done iterating returning an empty string");
            return "";
        }
        resource.getAnalysisEngine().process(cas);

        List<String> collected = new ArrayList<>();
        for (Sentence annotation : JCasUtil.select(cas.getJCas(), Sentence.class)) {
            collected.add(annotation.getCoveredText());
        }
        sentences = collected.iterator();

        // Keep pulling documents until one of them actually contains a sentence.
        while (!sentences.hasNext()) {
            if (!reader.hasNext()) {
                return null;
            }
            cas.reset();
            getReader().getNext(cas);
            resource.getAnalysisEngine().process(cas);
            for (Sentence annotation : JCasUtil.select(cas.getJCas(), Sentence.class)) {
                collected.add(annotation.getCoveredText());
            }
            sentences = collected.iterator();
        }

        String first = sentences.next();
        return this.getPreProcessor() == null ? first : this.getPreProcessor().preProcess(first);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.
the class TreeParser method getTrees.
/**
 * Gets trees from text.
 * <p>
 * The text is first run through the tokenizer engine so that sentence annotations
 * are available. Each sentence is then loaded into a second pooled CAS, tokenized
 * and parsed, and the resulting {@link TopTreebankNode} is converted with
 * {@code TreeFactory.buildTree}. The parse and every treebank node of each sentence
 * are logged at info level.
 *
 * @param text the text to process
 * @return one tree per sentence found in {@code text}; an empty list if
 *         {@code text} is empty
 * @throws Exception if tokenizing or parsing fails, or if the top-level node of a
 *         sentence cannot be selected
 */
public List<Tree> getTrees(String text) throws Exception {
    // Same short-circuit as the sibling tree-building methods.
    if (text.isEmpty())
        return new ArrayList<>();

    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        List<Tree> ret = new ArrayList<>();
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                c2.setDocumentText(sentence.getCoveredText());
                tokenizer.process(c2);
                parser.process(c2);

                //build the tree based on this
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                log.info("Tree bank parse " + node.getTreebankParse());
                for (TreebankNode node2 : JCasUtil.select(c2.getJCas(), TreebankNode.class)) {
                    log.info("Node val " + node2.getNodeValue() + " and label " + node2.getNodeType() + " and tags was " + node2.getNodeTags());
                }
                ret.add(TreeFactory.buildTree(node));

                // Clear the scratch CAS so the next sentence can be loaded into it.
                c2.reset();
            }
        } finally {
            // Return the scratch CAS to the pool even if a sentence failed to parse.
            pool.releaseCas(c2);
        }
        return ret;
    } finally {
        pool.releaseCas(c);
    }
}
use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.
the class TreeParser method getTrees.
/**
 * Gets trees from text, optionally pre-processing the text first.
 * <p>
 * The (pre-processed) text is run through the tokenizer engine so that sentence
 * annotations are available. For each sentence, the string returned by
 * {@code ContextLabelRetriever.stringWithLabels} is loaded into a second pooled
 * CAS, tokenized and parsed; the {@link TopTreebankNode} of that CAS is converted
 * with {@code TreeFactory.buildTree}. Finally {@code addPreTerminal} is applied to
 * every tree.
 *
 * @param text the text to process
 * @param preProcessor the pre processor to apply to {@code text} before it is
 *        segmented; may be {@code null} to skip pre-processing
 * @return one tree per sentence found in the text; an empty list if {@code text}
 *         is empty
 * @throws Exception if tokenizing or parsing fails, or if the top-level node of a
 *         sentence cannot be selected
 */
public List<Tree> getTrees(String text, SentencePreProcessor preProcessor) throws Exception {
    if (text.isEmpty())
        return new ArrayList<>();

    // Pre-process before borrowing from the pool so a failing pre-processor cannot leak a CAS.
    if (preProcessor != null)
        text = preProcessor.preProcess(text);

    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> p = ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(p.getFirst());
                tokenizer.process(c2);
                parser.process(c2);

                //build the tree based on this
                // The parser ran on c2, so the parse has to be read from c2 (not from c).
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                ret.add(TreeFactory.buildTree(node));

                // Clear the scratch CAS before the next sentence, as the sibling methods do.
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        // The sentence-splitting CAS was previously never handed back to the pool.
        pool.releaseCas(c);
    }

    for (Tree t : ret) {
        addPreTerminal(t);
    }
    return ret;
}
use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.
the class TreeParser method getTreesWithLabels.
/**
 * Gets trees from text whose sentences may carry inline span labels.
 * <p>
 * The text is first run through the tokenizer engine so that sentence annotations
 * are available. Each sentence is passed to
 * {@code ContextLabelRetriever.stringWithLabels}; the string it returns is
 * tokenized and parsed in a second pooled CAS, and the first
 * {@link TopTreebankNode} found is converted with {@code TreeFactory.buildTree}
 * together with the retrieved labels.
 * <p>
 * This will also process sentences with the following label format:
 * {@code <YOURLABEL> some text </YOURLABEL>}
 * <p>
 * This will allow you to iterate on and label sentences and label spans yourself.
 * <p>
 * A sentence is skipped when its parse yields no top-level node, or when
 * {@code SetUtils.difference} of the labels found in the sentence and the accepted
 * labels is non-empty.
 *
 * @param text the text to process
 * @param labels the accepted labels; they are lower-cased before being compared
 *        with the labels found in each sentence
 * @return one tree per accepted sentence; an empty list if {@code text} is empty
 * @throws Exception if tokenizing or parsing fails
 */
public List<Tree> getTreesWithLabels(String text, List<String> labels) throws Exception {
    // Same short-circuit as the sibling tree-building methods.
    if (text.isEmpty())
        return new ArrayList<>();

    List<String> lowerCaseLabels = new ArrayList<>();
    for (String s : labels) lowerCaseLabels.add(s.toLowerCase());
    labels = lowerCaseLabels;

    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                try {
                    Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringsWithLabels = ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                    c2.setDocumentText(stringsWithLabels.getFirst());
                    tokenizer.process(c2);
                    parser.process(c2);

                    //build the tree based on this
                    List<TopTreebankNode> nodes = new ArrayList<>(JCasUtil.select(c2.getJCas(), TopTreebankNode.class));
                    if (nodes.size() > 1) {
                        log.warn("More than one top level node for a treebank parse. Only accepting first input node.");
                    } else if (nodes.isEmpty()) {
                        continue;
                    }

                    Collection<String> labels2 = stringsWithLabels.getSecond().values();
                    Set<String> diff = SetUtils.difference(labels2, labels);
                    if (!diff.isEmpty()) {
                        log.warn("Found invalid sentence. Skipping");
                        continue;
                    }

                    TopTreebankNode node = nodes.get(0);
                    ret.add(TreeFactory.buildTree(node, stringsWithLabels, labels));
                } finally {
                    // Single reset point for every way out of the iteration (added, skipped, failed).
                    c2.reset();
                }
            }
        } finally {
            // Return the scratch CAS to the pool even if a sentence failed to parse.
            pool.releaseCas(c2);
        }
    } finally {
        pool.releaseCas(c);
    }
    return ret;
}
Aggregations