Search in sources :

Example 6 with CAS

use of org.apache.uima.cas.CAS in project deeplearning4j by deeplearning4j.

the class TreeParser method getTrees.

/**
 * Gets trees from text.
 * First a sentence segmenter is used to segment the text into sentences.
 * Each sentence is then tokenized and parsed in a scratch CAS, turned into a
 * tree, and finally passed through {@code addPreTerminal} before being returned.
 *
 * @param text the text to process; {@code null} or empty text yields an empty list
 * @param preProcessor the pre processor applied to the whole text before
 *                     segmentation; may be {@code null} to skip pre-processing
 * @return the list of trees, one per parsed sentence
 * @throws Exception if tokenizing or parsing fails, or if a sentence does not
 *                   produce exactly one top-level treebank node
 */
public List<Tree> getTrees(String text, SentencePreProcessor preProcessor) throws Exception {
    if (text == null || text.isEmpty()) {
        return new ArrayList<>();
    }
    // Pre-process before checking out a CAS so a failing pre-processor cannot strand one.
    if (preProcessor != null) {
        text = preProcessor.preProcess(text);
    }

    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        // Scratch CAS reused for every sentence; it is cleared at the end of each iteration.
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> p =
                        ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(p.getFirst());
                tokenizer.process(c2);
                parser.process(c2);
                // The parse lives in the scratch CAS (c2), not in the document-level CAS (c).
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                ret.add(TreeFactory.buildTree(node));
                // Clear the scratch CAS so the next sentence can set its own document text.
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        // Return the document-level CAS on success and on failure alike.
        pool.releaseCas(c);
    }

    for (Tree t : ret) {
        addPreTerminal(t);
    }
    return ret;
}
Also used : ArrayList(java.util.ArrayList) Token(org.cleartk.token.type.Token) MultiDimensionalMap(org.deeplearning4j.util.MultiDimensionalMap) CAS(org.apache.uima.cas.CAS) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) Tree(org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree) Sentence(org.cleartk.token.type.Sentence) Pair(org.deeplearning4j.berkeley.Pair)

Example 7 with CAS

use of org.apache.uima.cas.CAS in project deeplearning4j by deeplearning4j.

the class TreeParser method getTreesWithLabels.

/**
 * Gets trees from text.
 * First a sentence segmenter is used to segment the text into sentences.
 * Each sentence is then tokenized and parsed in a scratch CAS and turned into a tree.
 *
 * This will also process sentences with the following label format:
 * {@code <YOURLABEL> some text </YOURLABEL>}
 *
 * This will allow you to iterate on and label sentences and label spans yourself.
 *
 * Sentences that yield no top-level treebank node are skipped silently. Sentences
 * containing a label that is not in {@code labels} are skipped with a warning. When
 * a parse yields more than one top-level node, only the first one is used.
 *
 * @param text the text to process; {@code null} or empty text yields an empty list
 * @param labels the accepted labels; they are lower-cased before being compared
 *               and before being handed to the tree builder
 * @return the list of trees, one per accepted sentence
 * @throws Exception if tokenizing or parsing fails
 */
public List<Tree> getTreesWithLabels(String text, List<String> labels) throws Exception {
    if (text == null || text.isEmpty()) {
        return new ArrayList<>();
    }

    // Build the lower-cased label list up front so bad input fails before a CAS is checked out.
    List<String> lowerCaseLabels = new ArrayList<>();
    for (String s : labels) {
        lowerCaseLabels.add(s.toLowerCase());
    }
    labels = lowerCaseLabels;

    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        // Scratch CAS reused for every sentence; it is cleared at the end of each iteration.
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringsWithLabels =
                        ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(stringsWithLabels.getFirst());
                tokenizer.process(c2);
                parser.process(c2);

                List<TopTreebankNode> nodes =
                        new ArrayList<>(JCasUtil.select(c2.getJCas(), TopTreebankNode.class));
                if (nodes.size() > 1) {
                    log.warn("More than one top level node for a treebank parse. Only accepting first input node.");
                }
                if (!nodes.isEmpty()) {
                    // Any label found in the sentence that is not an accepted label invalidates it.
                    Collection<String> labels2 = stringsWithLabels.getSecond().values();
                    Set<String> diff = SetUtils.difference(labels2, labels);
                    if (diff.isEmpty()) {
                        ret.add(TreeFactory.buildTree(nodes.get(0), stringsWithLabels, labels));
                    } else {
                        log.warn("Found invalid sentence. Skipping");
                    }
                }
                // Single reset point covering the accepted, skipped and empty-parse paths.
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        pool.releaseCas(c);
    }
    return ret;
}
Also used : ArrayList(java.util.ArrayList) Token(org.cleartk.token.type.Token) MultiDimensionalMap(org.deeplearning4j.util.MultiDimensionalMap) CAS(org.apache.uima.cas.CAS) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) Tree(org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree) Sentence(org.cleartk.token.type.Sentence)

Aggregations

CAS (org.apache.uima.cas.CAS)7 ArrayList (java.util.ArrayList)5 Sentence (org.cleartk.token.type.Sentence)5 TopTreebankNode (org.cleartk.syntax.constituent.type.TopTreebankNode)4 Token (org.cleartk.token.type.Token)4 Tree (org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree)3 TreebankNode (org.cleartk.syntax.constituent.type.TreebankNode)2 MultiDimensionalMap (org.deeplearning4j.util.MultiDimensionalMap)2 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)1 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)1 Pair (org.deeplearning4j.berkeley.Pair)1