Search in sources :

Example 1 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class FstLinkingEngine method writeEnhancements.

/**
 * Writes the enhancements for the parsed tags to the metadata graph of the
 * parsed ContentItem: one fise:TextAnnotation per tag (an existing one with
 * the same start/end is reused) and one fise:EntityAnnotation per suggestion
 * of the tag.
 * @param ci the ContentItem whose metadata graph receives the triples
 * @param text the text used to build the selection context of newly
 * created text annotations
 * @param tags the tags to write
 * @param language the language used for the plain literals of newly created
 * text annotations. If <code>null</code> or empty no {@link Language} is
 * created and <code>null</code> is used as language of those literals
 * @param writeRankings if <code>true</code> the ranking of a suggestion is
 * written in case it is not <code>null</code>
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        //first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        //search for an existing text annotation: same start, same end and
        //typed as fise:TextAnnotation
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            //not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            //if existing add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
        }
        //add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        //now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //should we use the label used for the match, or search the
            //representation for the best label ... currently its the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
            //add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            //write origin information (only if an origin is configured)
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
                }
            }
            //TODO: dereferencing of the suggested entities is not yet implemented
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TestHtmlExtractor method testMicrodataExtraction.

/**
 * Tests the extraction of microdata from an HTML-5 document.
 *
 * @throws Exception on any unexpected failure while running the test
 */
@Test
public void testMicrodataExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-microdata.html";
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    try {
        // extract the microdata of the html document into the model
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    } finally {
        // release the test resource even if the extraction fails
        in.close();
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("Microdata triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(91, tripleCounter);
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) InputStream(java.io.InputStream) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) HtmlExtractor(org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor) Test(org.junit.Test)

Example 3 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class TestHtmlExtractor method testMFExtraction.

/**
 * Tests the extraction of Microformats from an HTML document.
 *
 * @throws Exception on any unexpected failure while running the test
 */
@Test
public void testMFExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-MF.html";
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    try {
        // extract the Microformat annotations of the html document into the model
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    } finally {
        // release the test resource even if the extraction fails
        in.close();
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("Microformat triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(127, tripleCounter);
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) InputStream(java.io.InputStream) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) HtmlExtractor(org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor) Test(org.junit.Test)

Example 4 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class LangIdEnhancementEngine method computeEnhancements.

/**
 * Identifies the language of the text of the parsed ContentItem and writes
 * it as a text enhancement (dc:language and dc:type) to its metadata.
 * Nothing is written if the text only consists of whitespace.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if the ContentItem has no content part with
 * one of the SUPPORTED_MIMTYPES
 * @throws InvalidContentException if reading the text of the content part
 * fails with an IOException
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
    String language = languageIdentifier.getLanguage();
    // parameterized like the other log statements of this method
    log.info("language identified as {}", language);
    // add language to metadata; the write lock is held while modifying the graph
    Graph g = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
        g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) LanguageIdentifier(org.apache.tika.language.LanguageIdentifier) Graph(org.apache.clerezza.commons.rdf.Graph) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) IOException(java.io.IOException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 5 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class LanguageDetectionEnhancementEngine method computeEnhancements.

/**
 * Detects the languages of the text of the parsed ContentItem and writes up
 * to maxSuggestedLanguages hypotheses as text enhancements (dc:language,
 * fise:confidence and dc:type) to its metadata.
 *
 * @param ci the ContentItem to enhance
 * @throws IllegalStateException if the ContentItem has no content part with
 * one of the SUPPORTED_MIMTYPES
 * @throws InvalidContentException if reading the text of the content part
 * fails with an IOException
 * @throws EngineException if the language detection fails with an error
 * code other than the two ignored ones (ordinal 0 and 5)
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    //do not call trim() on long texts to check if the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate text to some piece from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        //ignore " 0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            // only include the beginning of long texts in the error message
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            // the placeholder is required, otherwise the ContentItem is not logged
            log.debug("No text to detect the language from present in ContentItem {}", ci.getUri());
        }
    }
    // add language to metadata (languages stays null if the detection was skipped)
    if (languages != null) {
        Graph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add a hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(com.cybozu.labs.langdetect.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LangDetectException(com.cybozu.labs.langdetect.LangDetectException)

Aggregations

Graph (org.apache.clerezza.commons.rdf.Graph)172 IRI (org.apache.clerezza.commons.rdf.IRI)110 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)66 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)57 Triple (org.apache.clerezza.commons.rdf.Triple)45 IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph)43 Test (org.junit.Test)38 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)36 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)34 IOException (java.io.IOException)27 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)26 HashSet (java.util.HashSet)24 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)24 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)24 InputStream (java.io.InputStream)21 HashMap (java.util.HashMap)20 Language (org.apache.clerezza.commons.rdf.Language)17 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)17 ArrayList (java.util.ArrayList)16 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)15