Search in sources :

Example 6 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.

/**
 * Sends the plain text of the parsed content item to the CELI NER client and
 * writes one text enhancement per returned named entity into the metadata
 * graph of the content item.
 * <p>
 * Returns without touching the metadata when the text content is blank.
 * A {@link NoConvertorException} raised while writing a single entity is
 * logged and processing continues with the next entity.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part
 *         (as {@link InvalidContentException}) or if the call to the NER
 *         client fails with an {@link IOException} or {@link SOAPException}
 * @throws IllegalStateException if no supported content part or no language
 *         is available for the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final Entry<IRI, Blob> plainTextPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (plainTextPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }

    final String content;
    try {
        content = ContentItemHelper.getText(plainTextPart.getValue());
    } catch (IOException ioe) {
        throw new InvalidContentException(this, ci, ioe);
    }
    if (content.trim().isEmpty()) {
        // nothing to analyse
        log.info("No text contained in ContentPart {" + plainTextPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    final String languageCode = EnhancementEngineHelper.getLanguage(ci);
    if (languageCode == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    // language tag attached to every plain literal written below
    final Language literalLanguage = new Language(languageCode);

    try {
        final List<NamedEntity> entities = this.client.extractEntities(content, languageCode);
        final LiteralFactory typedLiterals = LiteralFactory.getInstance();
        final Graph metadata = ci.getMetadata();
        for (NamedEntity entity : entities) {
            try {
                final IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // the selected text is stored as a plain literal tagged with the content language
                final TripleImpl selectedText = new TripleImpl(annotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(entity.getFormKind(), literalLanguage));
                metadata.add(selectedText);
                final TripleImpl entityType = new TripleImpl(annotation, DC_TYPE,
                        getEntityRefForType(entity.type));
                metadata.add(entityType);
                if (entity.getFrom() == null || entity.getTo() == null) {
                    // no position information: skip start/end/selection context
                    continue;
                }
                metadata.add(new TripleImpl(annotation, ENHANCER_START,
                        typedLiterals.createTypedLiteral(entity.getFrom().intValue())));
                metadata.add(new TripleImpl(annotation, ENHANCER_END,
                        typedLiterals.createTypedLiteral(entity.getTo().intValue())));
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(getSelectionContext(content, entity.getFormKind(), entity.getFrom().intValue()), literalLanguage)));
            } catch (NoConvertorException nce) {
                log.error(nce.getMessage(), nce);
            }
        }
    } catch (IOException ioe) {
        throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)" + " service (configured URL: " + serviceURL + ")!", ioe);
    } catch (SOAPException se) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI NER (Named Entity Recognition) service!", se);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NoConvertorException(org.apache.clerezza.rdf.core.NoConvertorException) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 7 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class SpotlightEngineUtils method createEntityAnnotation.

/**
 * Creates a fise:EntityAnnotation for the parsed candidate resource and adds
 * it to the {@link ContentItem#getMetadata()} graph.
 * <p>
 * The annotation is dc:related to the parsed text annotation and carries the
 * candidate's label (always tagged with the language "en"), its URI and its
 * contextual score, percentage of second rank, support, prior score and
 * final score.
 * <p>
 * This method assumes a write lock on the parsed content item.
 *
 * @param resource the candidate resource
 * @param engine the engine
 * @param ci the content item
 * @param textAnnotation the fise:TextAnnotation the created
 *        fise:EntityAnnotation is dc:related to
 * @return the URI of the created fise:EntityAnnotation
 */
public static IRI createEntityAnnotation(CandidateResource resource, EnhancementEngine engine, ContentItem ci, IRI textAnnotation) {
    final IRI annotation = EnhancementEngineHelper.createEntityEnhancement(ci, engine);
    final Graph metadata = ci.getMetadata();
    final Literal englishLabel = new PlainLiteralImpl(resource.label, new Language("en"));

    // link to the text annotation and describe the referenced entity
    metadata.add(new TripleImpl(annotation, DC_RELATION, textAnnotation));
    metadata.add(new TripleImpl(annotation, ENHANCER_ENTITY_LABEL, englishLabel));
    metadata.add(new TripleImpl(annotation, ENHANCER_ENTITY_REFERENCE, resource.getUri()));

    // scores reported for the candidate, written as typed literals
    metadata.add(new TripleImpl(annotation, PROPERTY_CONTEXTUAL_SCORE,
            literalFactory.createTypedLiteral(resource.contextualScore)));
    metadata.add(new TripleImpl(annotation, PROPERTY_PERCENTAGE_OF_SECOND_RANK,
            literalFactory.createTypedLiteral(resource.percentageOfSecondRank)));
    metadata.add(new TripleImpl(annotation, PROPERTY_SUPPORT,
            literalFactory.createTypedLiteral(resource.support)));
    metadata.add(new TripleImpl(annotation, PROPERTY_PRIOR_SCORE,
            literalFactory.createTypedLiteral(resource.priorScore)));
    metadata.add(new TripleImpl(annotation, PROPERTY_FINAL_SCORE,
            literalFactory.createTypedLiteral(resource.finalScore)));
    return annotation;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Literal(org.apache.clerezza.commons.rdf.Literal) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 8 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class DBPSpotlightSpotEnhancementEngine method computeEnhancements.

/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results.
 * <p>
 * When the request yields no result ({@code null}) the content item is left
 * untouched. Otherwise the enhancements are created while holding the write
 * lock of the content item; the lock is always released again. If debug
 * logging is enabled the complete metadata graph is additionally serialized
 * as RDF/XML and logged.
 *
 * @param ci
 *            the {@link ContentItem}
 * @throws EngineException
 *             declared by this method; which of the called helpers raise it
 *             is not visible here
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<SurfaceForm> dbpslGraph = doPostRequest(text, ci.getUri());
    if (dbpslGraph == null) {
        return;
    }
    // Acquire a write lock on the ContentItem when adding the
    // enhancements
    ci.getLock().writeLock().lock();
    try {
        createEnhancements(dbpslGraph, ci, text, language);
        if (log.isDebugEnabled()) {
            Serializer serializer = Serializer.getInstance();
            ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
            serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
            try {
                log.debug("DBpedia Spotlight Spot Enhancements:\n{}", debugStream.toString("UTF-8"));
            } catch (UnsupportedEncodingException e) {
                // Only affects the debug output: report through the logger
                // instead of printing the stack trace to stderr.
                log.warn("Unable to decode the serialized enhancements as UTF-8 for debug logging", e);
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : Language(org.apache.clerezza.commons.rdf.Language) SurfaceForm(org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Serializer(org.apache.clerezza.rdf.core.serializedform.Serializer)

Example 9 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class ResultSetToXml method createValueElement.

/**
 * Builds the XML element representing a single RDF term.
 * <ul>
 * <li>{@link IRI}: a {@code uri} element containing the unicode string of the IRI</li>
 * <li>{@link Literal}: a {@code literal} element containing the lexical form,
 * with a {@code datatype} attribute and, if the literal has a language, an
 * {@code xml:lang} attribute</li>
 * <li>anything else: a {@code bnode} element containing {@code resource.toString()}</li>
 * </ul>
 *
 * @param resource the RDF term to represent
 * @param doc the document used to create the element and its text node
 * @return the newly created element (not yet attached to the document tree)
 */
private Element createValueElement(RDFTerm resource, Document doc) {
    if (resource instanceof IRI) {
        final Element uriElement = doc.createElement("uri");
        final String iriString = ((IRI) resource).getUnicodeString();
        uriElement.appendChild(doc.createTextNode(iriString));
        return uriElement;
    }

    if (resource instanceof Literal) {
        final Literal literal = (Literal) resource;
        final Element literalElement = doc.createElement("literal");
        literalElement.appendChild(doc.createTextNode(literal.getLexicalForm()));
        literalElement.setAttribute("datatype", literal.getDataType().getUnicodeString());
        final Language language = literal.getLanguage();
        if (language != null) {
            literalElement.setAttribute("xml:lang", language.toString());
        }
        return literalElement;
    }

    // neither IRI nor Literal: emitted as a blank node
    final Element bnodeElement = doc.createElement("bnode");
    bnodeElement.appendChild(doc.createTextNode(resource.toString()));
    return bnodeElement;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Language(org.apache.clerezza.commons.rdf.Language) Element(org.w3c.dom.Element) Literal(org.apache.clerezza.commons.rdf.Literal)

Example 10 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class EnhancementEngineHelperTest method testTextAnnotationNewModel.

/**
 * Checks the properties written by
 * {@code EnhancementEngineHelper.setOccurrence(..)} for three cases: a short
 * selection, a long selection with head/tail enabled and the same long
 * selection with head/tail disabled.
 */
@Test
public void testTextAnnotationNewModel() {
    final Language english = new Language("en");
    final IRI contentItemUri = new IRI("http://www.example.org/contentItem#1");
    final Graph graph = new IndexedGraph();

    // --- case 1: short selection ---
    final String shortText = "The Stanbol Enhancer can extract Entities form parsed Text.";
    final int shortStart = shortText.indexOf("Stanbol");
    final int shortEnd = shortStart + "Stanbol Enhancer".length();
    IRI annotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    // NOTE(review): the meaning of the -1 argument is not visible here; the
    // trailing boolean toggles the head/tail handling (see case 3)
    EnhancementEngineHelper.setOccurrence(graph, annotation, shortText, shortStart, shortEnd, english, -1, true);
    Assert.assertEquals("The ",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals("Stanbol Enhancer",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertEquals(" can extra",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(shortStart),
            EnhancementEngineHelper.get(graph, annotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(shortEnd),
            EnhancementEngineHelper.get(graph, annotation, Properties.ENHANCER_END, Integer.class, lf));
    // neither head nor tail is expected for the short selection
    Assert.assertNull(EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- case 2: long selection, head/tail enabled ---
    final String longText = "Ich habe den Schlüssel fürs Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss verlohren.";
    final int longStart = longText.indexOf("Donaudampfschi");
    final int longEnd = longText.indexOf(" verlohren");
    annotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, annotation, longText, longStart, longEnd, english, -1, true);
    Assert.assertEquals("ssel fürs ",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals(" verlohren",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(longStart),
            EnhancementEngineHelper.get(graph, annotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(longEnd),
            EnhancementEngineHelper.get(graph, annotation, Properties.ENHANCER_END, Integer.class, lf));
    // the full selected text must be absent ...
    Assert.assertNull(EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTED_TEXT));
    // ... and replaced by its head and tail
    Assert.assertEquals("Donaudampf",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertEquals("tenschloss",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- case 3: same long selection, head/tail disabled ---
    annotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, annotation, longText, longStart, longEnd, english, -1, false);
    Assert.assertEquals("Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss",
            EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, annotation, Properties.ENHANCER_SELECTION_TAIL));
}
Also used : BigInteger(java.math.BigInteger) IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Language(org.apache.clerezza.commons.rdf.Language) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Test(org.junit.Test)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language): 32; IRI (org.apache.clerezza.commons.rdf.IRI): 24; TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 20; PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 19; Graph (org.apache.clerezza.commons.rdf.Graph): 17; Literal (org.apache.clerezza.commons.rdf.Literal): 12; ArrayList (java.util.ArrayList): 8; BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI): 8; LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory): 8; IOException (java.io.IOException): 7; EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 7; HashSet (java.util.HashSet): 5; RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 5; AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 5; NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage): 5; ByteArrayOutputStream (java.io.ByteArrayOutputStream): 4; UnsupportedEncodingException (java.io.UnsupportedEncodingException): 4; HashMap (java.util.HashMap): 4; SOAPException (javax.xml.soap.SOAPException): 4; Triple (org.apache.clerezza.commons.rdf.Triple): 4