Search in sources :

Example 31 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class OpenCalaisEngine method createEnhancements.

/**
 * Writes the entity occurrences reported by OpenCalais into the metadata
 * graph of the given content item.
 * <p>
 * Every occurrence gets its own text enhancement carrying the type, the
 * selected text, the start and end offsets and the selection context.
 * A confidence value is added only when the occurrence has a relevance
 * that is neither <code>null</code> nor negative.
 * <p>
 * In NER-only mode the first text enhancement seen for an entity id is
 * remembered, and each later occurrence of that id is attached to it with a
 * <code>DC_RELATION</code> triple. Outside NER-only mode nothing is
 * remembered, because the creation of Calais specific entity annotations is
 * currently disabled.
 *
 * @param occs the entity occurrences to convert into enhancements
 * @param ci the content item whose metadata receives the enhancements
 */
public void createEnhancements(Collection<CalaisEntityOccurrence> occs, ContentItem ci) {
    LiteralFactory typedLiterals = LiteralFactory.getInstance();
    // language tag for plain literals that hold parts of the content;
    // stays null when the content item reports no (or an empty) language
    String langCode = EnhancementEngineHelper.getLanguage(ci);
    final Language contentLanguage = (langCode == null || langCode.isEmpty()) ? null : new Language(langCode);
    // TODO create TextEnhancement (form, start, end, type?) and EntityAnnotation (id, name, type)
    HashMap<RDFTerm, IRI> annotationByEntityId = new HashMap<RDFTerm, IRI>();
    for (CalaisEntityOccurrence occurrence : occs) {
        IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        Graph metadata = ci.getMetadata();
        metadata.add(new TripleImpl(annotation, DC_TYPE, occurrence.type));

        // for the autotagger use the entity name instead of the matched term,
        // because the matched term might be a pronoun
        String selectedText = onlyNERMode ? occurrence.name : occurrence.exact;
        metadata.add(new TripleImpl(annotation, ENHANCER_SELECTED_TEXT,
                new PlainLiteralImpl(selectedText, contentLanguage)));

        metadata.add(new TripleImpl(annotation, ENHANCER_START,
                typedLiterals.createTypedLiteral(occurrence.offset)));
        metadata.add(new TripleImpl(annotation, ENHANCER_END,
                typedLiterals.createTypedLiteral(occurrence.offset + occurrence.length)));
        metadata.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(occurrence.context, contentLanguage)));

        // the relevance doubles as confidence; it may be missing (null) or
        // the relevance feature may be switched off (reported as -1)
        boolean relevanceUsable = occurrence.relevance != null
                && Double.compare(occurrence.relevance, 0.0) >= 0;
        if (relevanceUsable) {
            metadata.add(new TripleImpl(annotation, ENHANCER_CONFIDENCE,
                    typedLiterals.createTypedLiteral(occurrence.relevance)));
        }

        if (annotationByEntityId.containsKey(occurrence.id)) {
            // entity already known: only reference the new text annotation
            metadata.add(new TripleImpl(annotationByEntityId.get(occurrence.id), DC_RELATION, annotation));
        } else if (onlyNERMode) {
            // no Calais specific entity annotation is created here (the
            // autotagger builds its own); remember the first text annotation
            // of this entity so later occurrences can point to it
            annotationByEntityId.put(occurrence.id, annotation);
        }
        // Outside NER-only mode the creation of Calais specific entity
        // annotations is disabled, so nothing is registered for the entity.
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) HashMap(java.util.HashMap) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory)

Example 32 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class RdfRepresentationTest method testPlainLiteralToTextConversion.

/*--------------------------------------------------------------------------
     * Additional Tests for special Features of the Clerezza based implementation
     * 
     * This mainly covers support for additional types such as PlainLiteral,
     * TypedLiteral and IRI: both the conversion to such types and the getters
     * for them.
     *--------------------------------------------------------------------------
     */
/**
 * The Clerezza RDF API represents natural language text as
 * {@link PlainLiteral}s. This test adds several of them through
 * {@link Representation#add(String, Object)} and checks that they can be
 * read back as {@link Text} values, both filtered by language via
 * {@link Representation#get(String, String...)} and all at once via
 * {@link Representation#getText(String)}.
 */
@Test
public void testPlainLiteralToTextConversion() {
    String field = "urn:test.RdfRepresentation:test.field";
    Literal untagged = new PlainLiteralImpl("A plain literal without Language");
    Literal english = new PlainLiteralImpl("An english literal", new Language("en"));
    Literal german = new PlainLiteralImpl("Ein Deutsches Literal", new Language("de"));
    Literal austrian = new PlainLiteralImpl("Ein Topfen Verband hilft bei Zerrungen", new Language("de-AT"));
    Collection<Literal> literals = Arrays.asList(untagged, english, german, austrian);

    Representation representation = createRepresentation(null);
    representation.add(field, literals);

    // The plain literals must now be visible as natural language texts
    // through the Representation interface.

    // 1) requesting the null language yields exactly the untagged literal
    Iterator<Text> untaggedTexts = representation.get(field, (String) null);
    assertTrue(untaggedTexts.hasNext());
    Text untaggedText = untaggedTexts.next();
    assertEquals(untagged.getLexicalForm(), untaggedText.getText());
    assertNull(untaggedText.getLanguage());
    assertFalse(untaggedTexts.hasNext());

    // 2) requesting "en" yields exactly the English literal
    Iterator<Text> englishTexts = representation.get(field, "en");
    assertTrue(englishTexts.hasNext());
    Text englishText = englishTexts.next();
    assertEquals(english.getLexicalForm(), englishText.getText());
    assertEquals(english.getLanguage().toString(), englishText.getLanguage());
    assertFalse(englishTexts.hasNext());

    // 3) getText returns every literal: each returned text must match one
    // still-expected lexical form, and none may be left over at the end
    Set<String> expectedForms = new HashSet<String>();
    for (Literal literal : literals) {
        expectedForms.add(literal.getLexicalForm());
    }
    for (Iterator<Text> allTexts = representation.getText(field); allTexts.hasNext();) {
        assertTrue(expectedForms.remove(allTexts.next().getText()));
    }
    assertTrue(expectedForms.isEmpty());
}
Also used : Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Literal(org.apache.clerezza.commons.rdf.Literal) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) HashSet(java.util.HashSet) RepresentationTest(org.apache.stanbol.entityhub.test.model.RepresentationTest) Test(org.junit.Test)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language)32 IRI (org.apache.clerezza.commons.rdf.IRI)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)19 Graph (org.apache.clerezza.commons.rdf.Graph)17 Literal (org.apache.clerezza.commons.rdf.Literal)12 ArrayList (java.util.ArrayList)8 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)8 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)8 IOException (java.io.IOException)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 HashSet (java.util.HashSet)5 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 HashMap (java.util.HashMap)4 SOAPException (javax.xml.soap.SOAPException)4 Triple (org.apache.clerezza.commons.rdf.Triple)4