Search in sources :

Example 41 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class DereferenceEngineTest method testOfflineMode.

/**
 * Tests the {@link OfflineMode} handling of the {@link EntityDereferenceEngine}.
 * <p>
 * The engine is created around a dereferencer whose
 * <code>supportsOfflineMode()</code> returns <code>false</code>. The test
 * asserts that <code>canEnhance(..)</code> does not return
 * {@link EnhancementEngine#CANNOT_ENHANCE} initially, and that it does
 * return {@link EnhancementEngine#CANNOT_ENHANCE} after
 * <code>setOfflineMode(true)</code> was called.
 * @throws Exception on any unexpected failure while setting up or running the test
 */
@Test
public void testOfflineMode() throws Exception {
    ContentItem ci = getContentItem("urn:test:testOfflineMode");
    //dereferencer that declares no support for offline mode
    EntityDereferencer onlineDereferencer = new TestDereferencer(null) {

        @Override
        public boolean supportsOfflineMode() {
            return false;
        }
    };
    //engine configuration: name "online" and both language filters disabled
    Dictionary<String, Object> dict = new Hashtable<String, Object>();
    dict.put(EnhancementEngine.PROPERTY_NAME, "online");
    dict.put(FILTER_CONTENT_LANGUAGES, false);
    dict.put(FILTER_ACCEPT_LANGUAGES, false);
    EntityDereferenceEngine engine = new EntityDereferenceEngine(onlineDereferencer, new DereferenceEngineConfig(dict, null));
    //engine in online mode
    //NOTE: JUnit expects (unexpected/expected, actual) - keeping this order
    //      ensures that failure messages report the values correctly
    Assert.assertNotEquals(EnhancementEngine.CANNOT_ENHANCE, engine.canEnhance(ci));
    //set engine in offline mode
    engine.setOfflineMode(true);
    Assert.assertEquals(EnhancementEngine.CANNOT_ENHANCE, engine.canEnhance(ci));
}
Also used : Hashtable(java.util.Hashtable) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 42 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TestLocationEnhancementEngine method testLocationEnhancementEngine.

/**
 * Runs the location enhancement engine on a content item that holds three
 * text annotations (person, organisation and place) and validates all
 * entity annotations found in the metadata afterwards.
 * <p>
 * If the engine fails with an {@link EngineException} the exception is
 * handed to <code>RemoteServiceHelper.checkServiceUnavailable(..)</code>
 * and the test ends without validating any results.
 * @throws IOException declared for the creation of the content item
 * @throws EngineException declared for failures of the tested engine
 */
@Test
public void testLocationEnhancementEngine() throws IOException, EngineException {
    //the content item used for this test
    final ContentItem contentItem = getContentItem("urn:org.apache:stanbol.enhancer:text:content-item:person", CONTEXT);
    //the three text annotations consumed by the engine
    getTextAnnotation(contentItem, PERSON, CONTEXT, DBPEDIA_PERSON);
    getTextAnnotation(contentItem, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
    getTextAnnotation(contentItem, PLACE, CONTEXT, DBPEDIA_PLACE);
    //let the engine compute its enhancements
    try {
        locationEnhancementEngine.computeEnhancements(contentItem);
    } catch (EngineException engineFailure) {
        RemoteServiceHelper.checkServiceUnavailable(engineFailure, "overloaded with requests");
        return;
    }
    //values every entity annotation is validated against
    final Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
    final String engineClassName = locationEnhancementEngine.getClass().getName();
    expected.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(engineClassName));
    //a null value marks the confidence as a required property
    expected.put(Properties.ENHANCER_CONFIDENCE, null);
    /*
     * Note:
     *  - Expected results depend on the geonames.org data. So if the test
     *    fails it may also mean that the data provided by geonames.org have
     *    changed
     */
    validateAllEntityAnnotations(contentItem.getMetadata(), expected);
    //The number of validated entity annotations is intentionally not checked:
    // - expected were two suggestions for New Zealand and one hierarchy
    //   entry for the first suggestion (three in total)
    // - NOTE 2012-10-10: changed expected value back to "3" as geonames.org
    //   again returns "Oceania" as parent for "New Zealand"
    // - NOTE 2012-11-12: deactivated the check (an assertEquals of 3 and the
    //   returned count), because the fact that "Oceania" is returned as
    //   parent for "New Zealand" changes every few weeks
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) HashMap(java.util.HashMap) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 43 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class LanguageDetectionEngineTest method testEngine.

/**
 * Tests the engine and validates the created enhancements.
 * <p>
 * The text of the first test file is loaded from the classpath, wrapped in
 * a {@link ContentItem} and processed by a freshly activated
 * {@link LanguageDetectionEnhancementEngine}. Afterwards the test expects
 * at least one text annotation, "en" as detected language and no entity
 * annotations.
 * @throws EngineException declared for failures of the tested engine
 * @throws IOException on any error while reading the test resource
 * @throws ConfigurationException declared for the activation of the engine
 * @throws LangDetectException declared for failures of the language detection
 */
@Test
public void testEngine() throws EngineException, ConfigurationException, LangDetectException, IOException {
    LOG.info("Testing engine: {}", TEST_FILE_NAMES[0]);
    InputStream in = LanguageDetectionEngineTest.class.getClassLoader().getResourceAsStream(TEST_FILE_NAMES[0]);
    assertNotNull("failed to load resource " + TEST_FILE_NAMES[0], in);
    String text;
    try {
        text = IOUtils.toString(in, "UTF-8");
    } finally {
        //release the resource stream even if reading the text fails
        in.close();
    }
    LanguageDetectionEnhancementEngine langIdEngine = new LanguageDetectionEnhancementEngine();
    ComponentContext context = new MockComponentContext();
    context.getProperties().put(EnhancementEngine.PROPERTY_NAME, "langdetect");
    langIdEngine.activate(context);
    ContentItem ci = ciFactory.createContentItem(new StringSource(text));
    langIdEngine.computeEnhancements(ci);
    //values every created annotation is validated against
    HashMap<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(langIdEngine.getClass().getName()));
    int textAnnotationCount = validateAllTextAnnotations(ci.getMetadata(), text, expectedValues);
    assertTrue("A TextAnnotation is expected", textAnnotationCount > 0);
    //even though this test does not validate detection quality
    //we expect that "en" is detected as best guess for the parsed text
    assertEquals("The detected language for text '" + text + "' MUST BE 'en'", "en", EnhancementEngineHelper.getLanguage(ci));
    int entityAnnoNum = validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
    assertEquals("No EntityAnnotations are expected", 0, entityAnnoNum);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) ComponentContext(org.osgi.service.component.ComponentContext) HashMap(java.util.HashMap) InputStream(java.io.InputStream) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 44 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class ContentItemReaderWriterTest method testReader.

/**
 * Reads a {@link ContentItem} back from the bytes written by
 * <code>serializeContentItem(..)</code> and compares the result with the
 * <code>contentItem</code> field of this test: URI, metadata, main Blob,
 * all Blob content parts, the size of the execution metadata and the
 * request properties.
 * @throws Exception on any unexpected failure while writing or reading
 */
@Test
public void testReader() throws Exception {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    MediaType contentType = serializeContentItem(out);
    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
    ContentItem ci = ciReader.readFrom(ContentItem.class, null, null, contentType, null, in);
    //assert ID
    assertEquals(contentItem.getUri(), ci.getUri());
    //assert metadata: removing all read triples from a copy of the expected
    //metadata must change the copy and leave it empty
    Graph copy = new SimpleGraph();
    copy.addAll(contentItem.getMetadata());
    assertTrue(copy.removeAll(ci.getMetadata()));
    assertTrue(copy.isEmpty());
    //assert Blob
    assertEquals(contentItem.getBlob().getMimeType(), ci.getBlob().getMimeType());
    String content = IOUtils.toString(contentItem.getStream(), "UTF-8");
    String readContent = IOUtils.toString(ci.getStream(), "UTF-8");
    assertEquals(content, readContent);
    Iterator<Entry<IRI, Blob>> contentItemBlobsIt = ContentItemHelper.getContentParts(contentItem, Blob.class).entrySet().iterator();
    Iterator<Entry<IRI, Blob>> ciBlobsIt = ContentItemHelper.getContentParts(ci, Blob.class).entrySet().iterator();
    //later used to validate enhancementMetadata
    Set<String> expectedParsedContentIds = new HashSet<String>();
    //compare the Blob parts of both content items pairwise
    while (contentItemBlobsIt.hasNext() && ciBlobsIt.hasNext()) {
        Entry<IRI, Blob> contentItemBlobPart = contentItemBlobsIt.next();
        Entry<IRI, Blob> ciBlobPart = ciBlobsIt.next();
        expectedParsedContentIds.add(ciBlobPart.getKey().getUnicodeString());
        assertEquals(contentItemBlobPart.getKey(), ciBlobPart.getKey());
        String partContentType = contentItemBlobPart.getValue().getMimeType();
        String readPartContentType = ciBlobPart.getValue().getMimeType();
        assertEquals(partContentType, readPartContentType);
        String partContent = IOUtils.toString(contentItemBlobPart.getValue().getStream(), "UTF-8");
        String readPartContent = IOUtils.toString(ciBlobPart.getValue().getStream(), "UTF-8");
        assertEquals(partContent, readPartContent);
    }
    //the loop ends as soon as one iterator is exhausted; without the
    //following checks a missing or additional Blob part would go unnoticed
    assertTrue("the read ContentItem is missing Blob parts of the original one", !contentItemBlobsIt.hasNext());
    assertTrue("the read ContentItem has Blob parts not present in the original one", !ciBlobsIt.hasNext());
    //validate ExecutionMetadata
    Graph executionMetadata = contentItem.getPart(ExecutionMetadata.CHAIN_EXECUTION, Graph.class);
    Graph readExecutionMetadata = ci.getPart(ExecutionMetadata.CHAIN_EXECUTION, Graph.class);
    assertNotNull(executionMetadata);
    assertNotNull(readExecutionMetadata);
    assertEquals(executionMetadata.size(), readExecutionMetadata.size());
    //validate EnhancementProperties
    Map<String, Object> reqProp = ContentItemHelper.getRequestPropertiesContentPart(ci);
    assertNotNull(reqProp);
    //the parsed value MUST BE overridden by the two content parts parsed
    assertEquals(expectedParsedContentIds, getParsedContentURIs(reqProp));
    Collection<String> outputContent = getOutputContent(reqProp);
    assertEquals(1, outputContent.size());
    assertEquals("*/*", outputContent.iterator().next());
    //NOTE(review): the collection below is created locally and not read from
    //   reqProp, so the two assertions on it can never fail. Presumably the
    //   output content parts of the request properties were meant to be
    //   validated here - TODO confirm and read the value from reqProp
    Collection<String> outputContentPart = Collections.singleton("*");
    assertEquals(1, outputContentPart.size());
    assertEquals("*", outputContentPart.iterator().next());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Entry(java.util.Map.Entry) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) ByteArrayInputStream(java.io.ByteArrayInputStream) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) MediaType(javax.ws.rs.core.MediaType) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 45 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class ContentItemReader method createContentItem.

/**
 * Creates a ContentItem
 * @param id the ID or <code>null</code> if not known
 * @param metadata the metadata or <code>null</code> if not parsed. NOTE that
 * if <code>id == null</code> also <code>metadata == null</code> and
 * <code>id != null</code> also <code>metadata != null</code>.
 * @param content the {@link FileItemStream} of the MIME part representing
 * the content. If {@link FileItemStream#getContentType()} is compatible with
 * "multipart/*" then this will further parse for multiple parsed content
 * versions. In any other case the contents of the parsed {@link FileItemStream}
 * will be directly added as content for the {@link ContentItem} created by
 * this method.
 * @param parsedContentParts used to add the IDs of parsed contentParts
 * @return the created content item
 * @throws IOException on any error while accessing the contents of the parsed
 * {@link FileItemStream}
 * @throws FileUploadException if the parsed contents are not correctly
 * encoded Multipart MIME or if a "multipart/*" content does not contain a
 * single content part
 */
private ContentItem createContentItem(IRI id, Graph metadata, FileItemStream content, Set<String> parsedContentParts) throws IOException, FileUploadException {
    MediaType partContentType = MediaType.valueOf(content.getContentType());
    ContentItem contentItem = null;
    ContentItemFactory ciFactory = getContentItemFactory();
    if (MULTIPART.isCompatible(partContentType)) {
        log.debug("  - multiple (alternate) ContentParts");
        //multiple contentParts are parsed
        FileItemIterator contentPartIterator = fu.getItemIterator(new MessageBodyReaderContext(content.openStream(), partContentType));
        while (contentPartIterator.hasNext()) {
            FileItemStream fis = contentPartIterator.next();
            if (contentItem == null) {
                //the first part is used to create the ContentItem itself
                log.debug("  - create ContentItem {} for content (type:{})", id, fis.getContentType());
                contentItem = ciFactory.createContentItem(id, new StreamSource(fis.openStream(), fis.getContentType()), metadata);
            } else {
                //all further parts are added as additional Blobs
                log.debug("  - create Blob for content (type:{})", fis.getContentType());
                Blob blob = ciFactory.createBlob(new StreamSource(fis.openStream(), fis.getContentType()));
                IRI contentPartId = null;
                if (fis.getFieldName() != null && !fis.getFieldName().isEmpty()) {
                    contentPartId = new IRI(fis.getFieldName());
                } else {
                    //generating a random ID might break metadata 
                    //TODO maybe we should throw an exception instead
                    contentPartId = new IRI("urn:contentpart:" + randomUUID());
                }
                log.debug("    ... add Blob {} to ContentItem {} with content (type:{})", new Object[] { contentPartId, id, fis.getContentType() });
                contentItem.addPart(contentPartId, blob);
                parsedContentParts.add(contentPartId.getUnicodeString());
            }
        }
        if (contentItem == null) {
            //a multipart content without any part: report this explicitly
            //instead of failing with a NullPointerException further down
            throw new FileUploadException("The multipart content (type: " + partContentType + ") of ContentItem " + id + " does not contain any content part");
        }
    } else {
        log.debug("  - create ContentItem {} for content (type:{})", id, content.getContentType());
        contentItem = ciFactory.createContentItem(id, new StreamSource(content.openStream(), content.getContentType()), metadata);
    }
    //add the URI of the main content to the parsed contentParts
    parsedContentParts.add(contentItem.getPartUri(0).getUnicodeString());
    return contentItem;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItemFactory(org.apache.stanbol.enhancer.servicesapi.ContentItemFactory) FileItemStream(org.apache.commons.fileupload.FileItemStream) StreamSource(org.apache.stanbol.enhancer.servicesapi.impl.StreamSource) MediaType(javax.ws.rs.core.MediaType) FileItemIterator(org.apache.commons.fileupload.FileItemIterator) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem)

Aggregations

ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)73 Test (org.junit.Test)62 IRI (org.apache.clerezza.commons.rdf.IRI)46 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)18 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)18 HashMap (java.util.HashMap)15 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)13 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)12 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 Graph (org.apache.clerezza.commons.rdf.Graph)8 Date (java.util.Date)6 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)6 Hashtable (java.util.Hashtable)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 IOException (java.io.IOException)3 InputStream (java.io.InputStream)3 MediaType (javax.ws.rs.core.MediaType)3 Triple (org.apache.clerezza.commons.rdf.Triple)3