Search in sources :

Example 86 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class CeliLanguageIdentifierEnhancementEngine method computeEnhancements.

/**
 * Asks the CELI language identifier service for the language of the
 * ContentItem's text and stores the first returned guess in the
 * ContentItem's metadata.
 * <p>
 * Texts that split into more than five space-separated tokens are sent to
 * {@code guessLanguage}; shorter texts are sent to
 * {@code guessQueryLanguage}. Only the first entry of the returned list
 * (presumably the best-ranked guess - confirm against the client) is written
 * as a text enhancement carrying the language, the confidence and the
 * {@code DCTERMS_LINGUISTIC_SYSTEM} type.
 *
 * @param ci the ContentItem to enhance
 * @throws InvalidContentException if the text of the content part cannot be read
 * @throws EngineException if calling the CELI service fails
 * @throws IllegalStateException if the ContentItem has no supported content part
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    try {
        // Short inputs (five tokens or fewer) go to the query-oriented endpoint.
        List<GuessedLanguage> lista;
        if (text.split(" ").length > 5) {
            lista = this.client.guessLanguage(text);
        } else {
            lista = this.client.guessQueryLanguage(text);
        }
        // Guard against a missing/empty result: get(0) would otherwise fail
        // with an unchecked exception while the write lock is held.
        if (lista == null || lista.isEmpty()) {
            log.info("No language guessed by the CELI language identifier service for ContentItem " + ci.getUri());
            return;
        }
        Graph g = ci.getMetadata();
        //in ENHANCE_ASYNC we need to use read/write locks on the ContentItem
        ci.getLock().writeLock().lock();
        try {
            GuessedLanguage gl = lista.get(0);
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(gl.getLang())));
            g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(gl.getConfidence())));
            g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI language" + " identifier service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI language identifier service!", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 87 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class DBPSpotlightDisambiguateEnhancementEngine method createEnhancements.

/**
 * Writes one entity annotation into the content item's metadata for every
 * given annotation whose surface form has an entry in
 * {@code textAnnotationsMap}. Annotations without such an entry are skipped.
 * <p>
 * Each entity annotation is related to the mapped text annotation
 * ({@code DC_RELATION}) and receives a label built from the surface form
 * name and the given language, one entity-type triple per type name (when
 * type names are available) and a reference to the annotation's URI.
 *
 * @param occs
 *            the annotations to convert
 * @param ci
 *            the content item whose metadata receives the triples
 * @param language
 *            the language attached to the entity label literal
 */
public void createEnhancements(Collection<Annotation> occs, ContentItem ci, Language language) {
    HashMap<RDFTerm, IRI> entityAnnotationMap = new HashMap<RDFTerm, IRI>();
    for (Annotation occ : occs) {
        IRI textAnnotation = textAnnotationsMap.get(occ.surfaceForm);
        if (textAnnotation == null) {
            // no text annotation known for this surface form
            continue;
        }
        Graph metadata = ci.getMetadata();
        IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
        entityAnnotationMap.put(occ.uri, entityAnnotation);
        Literal label = new PlainLiteralImpl(occ.surfaceForm.name, language);
        metadata.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
        metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, label));
        Collection<String> typeNames = occ.getTypeNames();
        if (typeNames != null) {
            for (String typeName : typeNames) {
                metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(typeName)));
            }
        }
        metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.uri));
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) HashMap(java.util.HashMap) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Literal(org.apache.clerezza.commons.rdf.Literal) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Annotation(org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation)

Example 88 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TestClerezzaInputSources method checkOntology.

/**
 * Verifies the input source under test: it must exist, its origin must be
 * set exactly when a TcProvider is used, and its root ontology must be
 * present and contain five triples.
 *
 * @param usesTcProvider whether the source is expected to expose an origin
 */
private void checkOntology(boolean usesTcProvider) throws Exception {
    assertNotNull(src);
    if (!usesTcProvider) {
        assertNull(src.getOrigin());
    } else {
        assertNotNull(src.getOrigin());
    }
    Graph ontology = src.getRootOntology();
    assertNotNull(ontology);
    log.info("Ontology loaded, is a {}", ontology.getClass().getCanonicalName());
    // The owl:Ontology declaration and versionInfo also count as triples.
    // NOTE(review): assertSame compares the boxed values by identity; this
    // holds for 5 because small Integer values are cached when boxed.
    assertSame(5, ontology.size());
}
Also used : Graph(org.apache.clerezza.commons.rdf.Graph)

Example 89 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class HtmlExtractorEngine method computeEnhancements.

/**
 * Extracts RDF from the ContentItem's stream with an {@link HtmlExtractor}
 * and adds the resulting triples to the ContentItem's metadata.
 * <p>
 * The extraction runs under the ContentItem's read lock into a private
 * {@link SimpleGraph}. Blank nodes of that graph are then urified and, if
 * {@code singleRootRdf} is set, the graph is connected to the ContentItem
 * URI. Only the final {@code addAll} is performed under the write lock.
 *
 * @param ci the ContentItem to process
 * @throws EngineException if the extractor reports an {@code ExtractorException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
    Graph model = new SimpleGraph();
    ci.getLock().readLock().lock();
    try {
        // NOTE(review): the stream returned by ci.getStream() is not closed
        // here - confirm whether the extractor or the ContentItem owns it.
        extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), model);
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    ClerezzaRDFUtils.urifyBlankNodes(model);
    // make the model single rooted
    if (singleRootRdf) {
        ClerezzaRDFUtils.makeConnected(model, ci.getUri(), new IRI(NIE_NS + "contains"));
    }
    // Log before taking the write lock: the model is still private to this
    // call, and rendering it should not extend the time the lock is held.
    LOG.info("Model: {}", model);
    //add the extracted triples to the metadata of the ContentItem
    ci.getLock().writeLock().lock();
    try {
        ci.getMetadata().addAll(model);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) ExtractorException(org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) HtmlExtractor(org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor)

Example 90 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class HtmlExtractor method main.

/**
 * Command line entry point: runs the extractor over every file named in
 * {@code args}, treating each as UTF-8 encoded {@code text/html}, and prints
 * a header line per file. The extracted graph itself is not printed yet.
 *
 * @param args paths of the HTML files to process
 * @throws Exception if a file cannot be opened or the extraction fails
 */
public static void main(String[] args) throws Exception {
    HtmlExtractor inst = new HtmlExtractor();
    // Identical for every file, so resolved once outside the loop.
    Charset charset = Charset.forName("UTF-8");
    String mimeType = "text/html";
    for (String arg : args) {
        File file = new File(arg);
        IRI uri = new IRI(file.toURI().toString());
        Graph container = new SimpleGraph();
        // try-with-resources closes the file stream even if extract() throws
        try (InputStream input = new FileInputStream(file)) {
            inst.extract(uri.getUnicodeString(), input, charset, mimeType, container);
        }
        System.out.println("Model for " + arg);
        //TODO
        //            container.writeTo(System.out);
        System.out.println();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Charset(java.nio.charset.Charset) File(java.io.File) FileInputStream(java.io.FileInputStream)

Aggregations

Graph (org.apache.clerezza.commons.rdf.Graph)172 IRI (org.apache.clerezza.commons.rdf.IRI)110 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)66 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)57 Triple (org.apache.clerezza.commons.rdf.Triple)45 IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph)43 Test (org.junit.Test)38 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)36 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)34 IOException (java.io.IOException)27 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)26 HashSet (java.util.HashSet)24 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)24 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)24 InputStream (java.io.InputStream)21 HashMap (java.util.HashMap)20 Language (org.apache.clerezza.commons.rdf.Language)17 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)17 ArrayList (java.util.ArrayList)16 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)15