Search in sources :

Example 1 with Sentence

use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.

the class TreeParser method getTreebankTrees.

/**
 * Segments the given text into sentences and parses each sentence into a treebank tree.
 *
 * <p>The full text is first run through the tokenizer engine on a pooled CAS so that
 * {@link Sentence} annotations can be selected from it. Each sentence's covered text is
 * then placed into its own CAS created via {@code tokenizer.newCAS()}, tokenized and
 * parsed there, and the single {@link TopTreebankNode} of that CAS is collected.
 *
 * <p>The pooled CAS is always handed back to the pool, even when processing fails.
 *
 * @param text the text to process; an empty string yields an empty list
 * @return one top-level treebank node per sentence, in the order the sentences are selected
 * @throws Exception if tokenizing, parsing or selecting the top node fails
 */
public List<TreebankNode> getTreebankTrees(String text) throws Exception {
    if (text.isEmpty()) {
        return new ArrayList<>();
    }
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);
        List<TreebankNode> ret = new ArrayList<>();
        for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
            // NOTE(review): a fresh, non-pooled CAS is used per sentence, presumably so the
            // returned nodes stay backed by a CAS that is never reset — confirm with callers.
            CAS c2 = tokenizer.newCAS();
            c2.setDocumentText(sentence.getCoveredText());
            tokenizer.process(c2);
            parser.process(c2);
            // build the tree based on this
            TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
            ret.add(node);
        }
        return ret;
    } finally {
        // Release on every path; previously an exception above leaked the pooled CAS.
        pool.releaseCas(c);
    }
}
Also used : CAS(org.apache.uima.cas.CAS) TreebankNode(org.cleartk.syntax.constituent.type.TreebankNode) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) ArrayList(java.util.ArrayList) Token(org.cleartk.token.type.Token) Sentence(org.cleartk.token.type.Sentence)

Example 2 with Sentence

use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.

the class UimaSentenceIterator method nextSentence.

/**
 * Returns the next sentence, loading and analysing further documents from the reader
 * whenever the current batch of sentences is exhausted.
 *
 * <p>Each returned sentence is passed through the pre-processor when one is configured.
 *
 * @return the next (optionally pre-processed) sentence; {@code ""} if fetching the next
 *         document from the reader throws; {@code null} if the reader has no more documents
 * @throws RuntimeException wrapping any exception raised while loading or analysing a document
 */
@Override
public synchronized String nextSentence() {
    // Fast path: the current batch still has sentences left.
    if (sentences != null && sentences.hasNext()) {
        String next = sentences.next();
        if (this.getPreProcessor() != null) {
            next = this.getPreProcessor().preProcess(next);
        }
        return next;
    }

    try {
        if (!getReader().hasNext()) {
            return null;
        }

        CAS cas = resource.retrieve();
        try {
            getReader().getNext(cas);
        } catch (Exception e) {
            log.warn("Done iterating returning an empty string");
            return "";
        }
        resource.getAnalysisEngine().process(cas);

        List<String> covered = new ArrayList<>();
        for (Sentence found : JCasUtil.select(cas.getJCas(), Sentence.class)) {
            covered.add(found.getCoveredText());
        }
        sentences = covered.iterator();

        // Keep pulling documents until one of them actually contains a sentence.
        // The list is necessarily empty each time this loop body is entered.
        while (!sentences.hasNext()) {
            if (!reader.hasNext()) {
                return null;
            }
            cas.reset();
            getReader().getNext(cas);
            resource.getAnalysisEngine().process(cas);
            for (Sentence found : JCasUtil.select(cas.getJCas(), Sentence.class)) {
                covered.add(found.getCoveredText());
            }
            sentences = covered.iterator();
        }

        String first = sentences.next();
        if (this.getPreProcessor() != null) {
            first = this.getPreProcessor().preProcess(first);
        }
        return first;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : CAS(org.apache.uima.cas.CAS) ArrayList(java.util.ArrayList) Sentence(org.cleartk.token.type.Sentence) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException)

Example 3 with Sentence

use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.

the class TreeParser method getTrees.

/**
 * Segments the given text into sentences and converts each sentence's parse into a {@link Tree}.
 *
 * <p>The full text is tokenized on one pooled CAS to obtain {@link Sentence} annotations.
 * A second pooled CAS is reused for every sentence: it receives the sentence text, is
 * tokenized and parsed, its single {@link TopTreebankNode} is turned into a tree via
 * {@code TreeFactory.buildTree}, and it is reset before the next sentence.
 *
 * <p>Both pooled CASes are always handed back to the pool, even when processing fails.
 *
 * @param text the text to process
 * @return one tree per sentence, in the order the sentences are selected
 * @throws Exception if tokenizing, parsing or tree construction fails
 */
public List<Tree> getTrees(String text) throws Exception {
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);
        List<Tree> ret = new ArrayList<>();
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                c2.setDocumentText(sentence.getCoveredText());
                tokenizer.process(c2);
                parser.process(c2);
                // build the tree based on this
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                log.info("Tree bank parse " + node.getTreebankParse());
                for (TreebankNode node2 : JCasUtil.select(c2.getJCas(), TreebankNode.class)) {
                    log.info("Node val " + node2.getNodeValue() + " and label " + node2.getNodeType() + " and tags was " + node2.getNodeTags());
                }
                ret.add(TreeFactory.buildTree(node));
                // Clear the per-sentence CAS so it can take the next sentence's text.
                c2.reset();
            }
        } finally {
            // Release on every path; previously an exception above leaked this CAS.
            pool.releaseCas(c2);
        }
        return ret;
    } finally {
        pool.releaseCas(c);
    }
}
Also used : CAS(org.apache.uima.cas.CAS) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) TreebankNode(org.cleartk.syntax.constituent.type.TreebankNode) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) ArrayList(java.util.ArrayList) Tree(org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree) Token(org.cleartk.token.type.Token) Sentence(org.cleartk.token.type.Sentence)

Example 4 with Sentence

use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.

the class TreeParser method getTrees.

/**
 * Segments the (optionally pre-processed) text into sentences and converts each sentence's
 * parse into a {@link Tree}.
 *
 * <p>The full text is tokenized on one pooled CAS to obtain {@link Sentence} annotations.
 * For every sentence, {@code ContextLabelRetriever.stringWithLabels} is applied and the
 * first element of the resulting pair is placed into a second pooled CAS, which is
 * tokenized and parsed. The single {@link TopTreebankNode} of that second CAS is turned
 * into a tree via {@code TreeFactory.buildTree}. Finally {@code addPreTerminal} is applied
 * to every tree.
 *
 * <p>Both pooled CASes are always handed back to the pool, even when processing fails.
 *
 * @param text the text to process; an empty string yields an empty list
 * @param preProcessor the pre processor to apply to the text before segmentation; may be
 *        {@code null}, in which case the text is used as-is
 * @return one tree per sentence, in the order the sentences are selected
 * @throws Exception if pre-processing, tokenizing, parsing or tree construction fails
 */
public List<Tree> getTrees(String text, SentencePreProcessor preProcessor) throws Exception {
    if (text.isEmpty()) {
        return new ArrayList<>();
    }
    // Pre-process before checking out any CAS so a failing pre-processor cannot leak one.
    if (preProcessor != null) {
        text = preProcessor.preProcess(text);
    }
    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> p = ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(p.getFirst());
                tokenizer.process(c2);
                parser.process(c2);
                // The parse lives in c2 (the CAS the parser ran on), not in the document CAS c.
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                ret.add(TreeFactory.buildTree(node));
                // Clear the per-sentence CAS so it can take the next sentence's text.
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        // The document CAS was previously never returned to the pool.
        pool.releaseCas(c);
    }
    for (Tree t : ret) {
        addPreTerminal(t);
    }
    return ret;
}
Also used : ArrayList(java.util.ArrayList) Token(org.cleartk.token.type.Token) MultiDimensionalMap(org.deeplearning4j.util.MultiDimensionalMap) CAS(org.apache.uima.cas.CAS) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) Tree(org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree) Sentence(org.cleartk.token.type.Sentence) Pair(org.deeplearning4j.berkeley.Pair)

Example 5 with Sentence

use of org.cleartk.token.type.Sentence in project deeplearning4j by deeplearning4j.

the class TreeParser method getTreesWithLabels.

/**
 * Segments the given text into sentences and converts each sentence's parse into a labelled
 * {@link Tree}.
 *
 * <p>Sentences may carry span labels in the following format:
 * {@code <YOURLABEL> some text </YOURLABEL>}. This lets callers label sentences and spans
 * themselves.
 *
 * <p>For every sentence, {@code ContextLabelRetriever.stringWithLabels} is applied; the first
 * element of the resulting pair is tokenized and parsed on a second pooled CAS. A sentence
 * is skipped when its parse has no {@link TopTreebankNode}, or when
 * {@code SetUtils.difference} between the labels found in the sentence and the lower-cased
 * {@code labels} is non-empty. When a parse has more than one top node, only the first is used.
 *
 * <p>Both pooled CASes are always handed back to the pool, even when processing fails.
 *
 * @param text the text to process
 * @param labels the accepted labels; they are lower-cased before being compared and before
 *        being passed to {@code TreeFactory.buildTree}. Must not be {@code null}.
 * @return one tree per accepted sentence, in the order the sentences are selected
 * @throws Exception if tokenizing, parsing or tree construction fails
 */
public List<Tree> getTreesWithLabels(String text, List<String> labels) throws Exception {
    // Normalise the labels first so a bad argument fails before any CAS is checked out.
    List<String> lowerCaseLabels = new ArrayList<>();
    for (String s : labels) {
        lowerCaseLabels.add(s.toLowerCase());
    }

    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);
        List<Tree> ret = new ArrayList<>();
        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringsWithLabels = ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(stringsWithLabels.getFirst());
                tokenizer.process(c2);
                parser.process(c2);
                // build the tree based on this
                List<TopTreebankNode> nodes = new ArrayList<>(JCasUtil.select(c2.getJCas(), TopTreebankNode.class));
                if (nodes.size() > 1) {
                    log.warn("More than one top level node for a treebank parse. Only accepting first input node.");
                } else if (nodes.isEmpty()) {
                    // Nothing was parsed for this sentence; clear the CAS and move on.
                    c2.reset();
                    continue;
                }
                Collection<String> labels2 = stringsWithLabels.getSecond().values();
                Set<String> diff = SetUtils.difference(labels2, lowerCaseLabels);
                if (!diff.isEmpty()) {
                    log.warn("Found invalid sentence. Skipping");
                    c2.reset();
                    continue;
                }
                TopTreebankNode node = nodes.get(0);
                ret.add(TreeFactory.buildTree(node, stringsWithLabels, lowerCaseLabels));
                // Clear the per-sentence CAS so it can take the next sentence's text.
                c2.reset();
            }
        } finally {
            // Release on every path; previously an exception above leaked this CAS.
            pool.releaseCas(c2);
        }
        return ret;
    } finally {
        pool.releaseCas(c);
    }
}
Also used : ArrayList(java.util.ArrayList) Token(org.cleartk.token.type.Token) MultiDimensionalMap(org.deeplearning4j.util.MultiDimensionalMap) CAS(org.apache.uima.cas.CAS) TopTreebankNode(org.cleartk.syntax.constituent.type.TopTreebankNode) Tree(org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree) Sentence(org.cleartk.token.type.Sentence)

Aggregations

ArrayList (java.util.ArrayList)5 CAS (org.apache.uima.cas.CAS)5 Sentence (org.cleartk.token.type.Sentence)5 TopTreebankNode (org.cleartk.syntax.constituent.type.TopTreebankNode)4 Token (org.cleartk.token.type.Token)4 Tree (org.deeplearning4j.nn.layers.feedforward.autoencoder.recursive.Tree)3 TreebankNode (org.cleartk.syntax.constituent.type.TreebankNode)2 MultiDimensionalMap (org.deeplearning4j.util.MultiDimensionalMap)2 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)1 Pair (org.deeplearning4j.berkeley.Pair)1