Search in sources :

Example 21 with LiteralFactory

use of org.apache.clerezza.rdf.core.LiteralFactory in project stanbol by apache.

the class OpenCalaisEngine method createEnhancements.

/**
 * Writes enhancement structures for the given OpenCalais entity occurrences
 * into the metadata graph of the content item.
 * <p>
 * Every occurrence yields one TextAnnotation that carries its type, the
 * selected text, start and end offsets, the selection context and - if a
 * non-negative relevance is present - a confidence value. When the id of an
 * occurrence was already registered by an earlier occurrence, a DC_RELATION
 * triple from the registered resource to the new TextAnnotation is added.
 * In NER-only mode the first TextAnnotation seen for an id is the registered
 * resource; outside of that mode nothing is registered, because the creation
 * of Calais specific EntityAnnotations is currently disabled.
 *
 * @param occs a Collection of entity information
 * @param ci the content item
 */
public void createEnhancements(Collection<CalaisEntityOccurrence> occs, ContentItem ci) {
    final LiteralFactory typedLiterals = LiteralFactory.getInstance();
    // language used for the plain literals that represent parts of the content
    final String langTag = EnhancementEngineHelper.getLanguage(ci);
    final Language language = (langTag == null || langTag.isEmpty()) ? null : new Language(langTag);
    // TODO create TextEnhancement (form, start, end, type?) and EntityAnnotation (id, name, type)
    // maps the id of an occurrence to the resource later occurrences with the same id relate to
    HashMap<RDFTerm, IRI> entityAnnotationMap = new HashMap<RDFTerm, IRI>();
    for (CalaisEntityOccurrence occ : occs) {
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        Graph metadata = ci.getMetadata();
        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, occ.type));

        // for the autotagger use the name instead of the matched term (that might be a pronoun!)
        final PlainLiteralImpl selectedText;
        if (onlyNERMode) {
            selectedText = new PlainLiteralImpl(occ.name, language);
        } else {
            selectedText = new PlainLiteralImpl(occ.exact, language);
        }
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, selectedText));

        metadata.add(new TripleImpl(textAnnotation, ENHANCER_START,
                typedLiterals.createTypedLiteral(occ.offset)));
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_END,
                typedLiterals.createTypedLiteral(occ.offset + occ.length)));
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(occ.context, language)));

        // The relevance is used as confidence. It may be missing (NULL) or
        // the relevance feature may not be activated (-1), so only values
        // that are not smaller than zero are written.
        boolean relevanceUsable = occ.relevance != null
                && Double.valueOf(0).compareTo(occ.relevance) <= 0;
        if (relevanceUsable) {
            metadata.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE,
                    typedLiterals.createTypedLiteral(occ.relevance)));
        }

        // register a resource per id only once, but relate every further
        // textAnnotation with the same id to it
        if (entityAnnotationMap.containsKey(occ.id)) {
            IRI registered = entityAnnotationMap.get(occ.id);
            metadata.add(new TripleImpl(registered, DC_RELATION, textAnnotation));
        } else if (onlyNERMode) {
            // don't create Calais specific entity annotations; let the autotagger do its own
            // but keep a pointer to the first text annotation with that id
            entityAnnotationMap.put(occ.id, textAnnotation);
        }
        // NOTE: outside of NER-only mode nothing is registered. The creation of a
        // Calais specific EntityAnnotation (DC_RELATION, ENHANCER_ENTITY_LABEL,
        // ENHANCER_ENTITY_TYPE and ENHANCER_ENTITY_REFERENCE triples) is disabled.
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) HashMap(java.util.HashMap) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory)

Example 22 with LiteralFactory

use of org.apache.clerezza.rdf.core.LiteralFactory in project stanbol by apache.

the class TestKuromojiNlpEngine method testEngine.

/**
 * Runs the engine on the test content item and checks the created
 * TextAnnotations as well as the number of sentences and, per sentence, the
 * number of chunks and tokens together with the presence of their NER and
 * POS annotations.
 */
@Test
public void testEngine() throws EngineException {
    LiteralFactory literals = LiteralFactory.getInstance();
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);

    // values every TextAnnotation is expected to carry
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.DC_CREATOR, literals.createTypedLiteral(engine.getClass().getName()));
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    int textAnnotationCount = EnhancementStructureHelper.validateAllTextAnnotations(
            contentItem.getMetadata(), text, expectedValues);
    Assert.assertEquals(16, textAnnotationCount);

    AnalysedText analysedText = AnalysedTextUtils.getAnalysedText(contentItem);
    Assert.assertNotNull(analysedText);
    List<Sentence> sentences = AnalysedTextUtils.asList(analysedText.getSentences());
    Assert.assertNotNull(sentences);
    Assert.assertEquals(7, sentences.size());

    // TODO: the numbers below were recorded from the first run of the engine.
    // They can only detect changes in the results; they do not validate that
    // the tokenization and NER detections are correct - sorry I do not
    // speak Japanese ...
    int[] expectedChunks = { 5, 3, 1, 0, 1, 2, 4 };
    int[] expectedTokens = { 25, 25, 25, 24, 33, 17, 32 };
    for (int i = 0; i < sentences.size(); i++) {
        Sentence sentence = sentences.get(i);

        // every chunk of the sentence needs a NER annotation with a type
        List<Chunk> chunks = AnalysedTextUtils.asList(sentence.getChunks());
        Assert.assertEquals(expectedChunks[i], chunks.size());
        for (Chunk chunk : chunks) {
            Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
            Assert.assertNotNull(ner);
            Assert.assertNotNull(ner.value().getType());
        }

        // every token of the sentence needs a POS annotation
        List<Token> tokens = AnalysedTextUtils.asList(sentence.getTokens());
        Assert.assertEquals(expectedTokens[i], tokens.size());
        for (Token token : tokens) {
            Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            Assert.assertNotNull(pos);
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Test(org.junit.Test)

Example 23 with LiteralFactory

use of org.apache.clerezza.rdf.core.LiteralFactory in project stanbol by apache.

the class ResourceAdapterTest method testFloat.

/**
 * Adds NaN and both infinities as typed float literals to a graph and checks
 * that reading the field back as {@link Float} returns exactly these values.
 */
@Test
public void testFloat() {
    Float[] specialValues = { Float.NaN, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY };
    Graph graph = new IndexedGraph();
    IRI subject = new IRI("http://www.example.org/test");
    IRI field = new IRI("http://www.example.org/field/double");
    LiteralFactory literals = LiteralFactory.getInstance();
    for (Float special : specialValues) {
        graph.add(new TripleImpl(subject, field, literals.createTypedLiteral(special)));
    }
    RdfValueFactory valueFactory = new RdfValueFactory(graph);
    Representation representation = valueFactory.createRepresentation(subject.getUnicodeString());

    // each value read back must remove one entry; nothing may be left over
    Set<Float> remaining = new HashSet<Float>(Arrays.asList(specialValues));
    for (Iterator<Float> values = representation.get(field.getUnicodeString(), Float.class); values.hasNext();) {
        Float value = values.next();
        Assert.assertNotNull(value);
        Assert.assertTrue(remaining.remove(value));
    }
    Assert.assertTrue(remaining.isEmpty());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) RdfValueFactory(org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 24 with LiteralFactory

use of org.apache.clerezza.rdf.core.LiteralFactory in project stanbol by apache.

the class ResourceAdapterTest method testDouble.

/**
 * Test related to STANBOL-698: adds NaN and both infinities as typed double
 * literals to a graph and checks that reading the field back as
 * {@link Double} returns exactly these values.
 */
@Test
public void testDouble() {
    Double[] specialValues = { Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY };
    Graph graph = new IndexedGraph();
    IRI subject = new IRI("http://www.example.org/test");
    IRI field = new IRI("http://www.example.org/field/double");
    LiteralFactory literals = LiteralFactory.getInstance();
    for (Double special : specialValues) {
        graph.add(new TripleImpl(subject, field, literals.createTypedLiteral(special)));
    }
    RdfValueFactory valueFactory = new RdfValueFactory(graph);
    Representation representation = valueFactory.createRepresentation(subject.getUnicodeString());

    // each value read back must remove one entry; nothing may be left over
    Set<Double> remaining = new HashSet<Double>(Arrays.asList(specialValues));
    for (Iterator<Double> values = representation.get(field.getUnicodeString(), Double.class); values.hasNext();) {
        Double value = values.next();
        Assert.assertNotNull(value);
        Assert.assertTrue(remaining.remove(value));
    }
    Assert.assertTrue(remaining.isEmpty());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) RdfValueFactory(org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)24 IRI (org.apache.clerezza.commons.rdf.IRI)22 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 Graph (org.apache.clerezza.commons.rdf.Graph)15 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)10 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)9 Language (org.apache.clerezza.commons.rdf.Language)8 Triple (org.apache.clerezza.commons.rdf.Triple)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 Literal (org.apache.clerezza.commons.rdf.Literal)4 IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph)4 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)4 ArrayList (java.util.ArrayList)3 Date (java.util.Date)3 Map (java.util.Map)3 SOAPException (javax.xml.soap.SOAPException)3 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)3