Search in sources :

Example 1 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class KeywordLinkingEngineTest method testEngine.

/**
 * Verifies that the enhancements written by the {@link KeywordLinkingEngine}
 * conform to the rules defined for the Stanbol Enhancement Structure.
 * <p>
 * The engine is run over {@code TEST_TEXT} (flagged as English) and the
 * resulting fise:TextAnnotations and fise:EntityAnnotations are validated
 * and counted.
 *
 * @throws IOException
 * @throws EngineException
 */
@Test
public void testEngine() throws IOException, EngineException {
    // configure the engine so that redirects are followed
    EntityLinkerConfig config = new EntityLinkerConfig();
    config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
    KeywordLinkingEngine linkingEngine = KeywordLinkingEngine.createInstance(
            openNLP, searcher, new TextAnalyzerConfig(), config);
    linkingEngine.referencedSiteName = TEST_REFERENCED_SITE_NAME;

    ContentItem contentItem = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    IRI contentItemUri = contentItem.getUri();
    // mark the content as English so the engine knows which language to process
    contentItem.getMetadata().add(
            new TripleImpl(contentItemUri, DC_LANGUAGE, new PlainLiteralImpl("en")));

    // run the engine
    linkingEngine.computeEnhancements(contentItem);

    // values every created enhancement is expected to carry
    Map<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
    expected.put(ENHANCER_EXTRACTED_FROM, contentItemUri);
    expected.put(DC_CREATOR,
            LiteralFactory.getInstance().createTypedLiteral(linkingEngine.getClass().getName()));
    // a null value marks the confidence as a required property
    expected.put(Properties.ENHANCER_CONFIDENCE, null);

    // check the created fise:TextAnnotations
    assertEquals("Four fise:TextAnnotations are expected by this Test", 4,
            validateAllTextAnnotations(contentItem.getMetadata(), TEST_TEXT, expected));
    // check the created fise:EntityAnnotations
    assertEquals("Five fise:EntityAnnotations are expected by this Test", 5,
            validateAllEntityAnnotations(contentItem, expected));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) EntityLinkerConfig(org.apache.stanbol.enhancer.engines.keywordextraction.impl.EntityLinkerConfig) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) HashMap(java.util.HashMap) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) KeywordLinkingEngine(org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine) TextAnalyzerConfig(org.apache.stanbol.commons.opennlp.TextAnalyzer.TextAnalyzerConfig) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 2 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class LanguageDetectionEngineTest method testNonTextContent.

/**
 * Activates a {@link LanguageDetectionEnhancementEngine} named "langdetect"
 * and lets it process a content item whose only content is the string "123".
 * <p>
 * The test contains no assertions: it passes when activation and
 * enhancement complete without throwing.
 */
@Test
public void testNonTextContent() throws EngineException, ConfigurationException, LangDetectException, IOException {
    LanguageDetectionEnhancementEngine detector = new LanguageDetectionEnhancementEngine();
    ComponentContext componentContext = new MockComponentContext();
    // the engine name is taken from the component properties
    componentContext.getProperties().put(EnhancementEngine.PROPERTY_NAME, "langdetect");
    detector.activate(componentContext);
    // digits only: no natural language text to detect
    detector.computeEnhancements(ciFactory.createContentItem(new StringSource("123")));
}
Also used : ComponentContext(org.osgi.service.component.ComponentContext) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 3 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class ContentItemReader method readFrom.

/**
 * Reads a {@link ContentItem} from the entity stream of a request.
 * <p>
 * If the media type is compatible with {@code MULTIPART} the MIME parts are
 * processed in the order they appear:
 * <ul>
 * <li>{@code metadata}: RDF parsed into the metadata graph; must precede the
 *     {@code content} part. Its file name is used as content item id if no
 *     id was provided otherwise.</li>
 * <li>{@code content}: used to create the content item.</li>
 * <li>{@code properties} (or the request properties URI): a JSON object
 *     merged into the request properties; must follow the {@code content}
 *     part.</li>
 * <li>any other name: RDF added as an additional content part using the
 *     field name as part URI; must follow the {@code content} part.</li>
 * </ul>
 * For every other media type the whole entity stream becomes the main
 * content of the content item.
 * <p>
 * NOTE: with TRACE logging enabled the complete entity is copied into a
 * byte array so that it can be logged.
 *
 * @param type not used by this implementation
 * @param genericType not used by this implementation
 * @param annotations not used by this implementation
 * @param mediaType the media type of the request entity
 * @param httpHeaders the request headers (only used for TRACE logging)
 * @param entityStream the stream providing the request entity
 * @return the content item created from the request
 * @throws IOException on errors while reading the entity stream
 * @throws WebApplicationException with status BAD_REQUEST if the multipart
 *         request is malformed (wrong order of parts, missing 'content'
 *         part, unparseable metadata/properties)
 */
@Override
public ContentItem readFrom(Class<ContentItem> type, Type genericType, Annotation[] annotations, MediaType mediaType, MultivaluedMap<String, String> httpHeaders, InputStream entityStream) throws IOException, WebApplicationException {
    //boolean withMetadata = withMetadata(httpHeaders);
    ContentItem contentItem = null;
    IRI contentItemId = getContentItemId();
    if (log.isTraceEnabled()) {
        //NOTE: enabling TRACE level logging will copy the parsed content
        //      into a BYTE array
        log.trace("Parse ContentItem from");
        log.trace("  - MediaType: {}", mediaType);
        log.trace("  - Headers:");
        for (Entry<String, List<String>> header : httpHeaders.entrySet()) {
            log.trace("      {}: {}", header.getKey(), header.getValue());
        }
        byte[] content = IOUtils.toByteArray(entityStream);
        log.trace("content: \n{}", new String(content, "UTF-8"));
        IOUtils.closeQuietly(entityStream);
        //the original stream is consumed: continue with the buffered copy
        entityStream = new ByteArrayInputStream(content);
    }
    Set<String> parsedContentIds = new HashSet<String>();
    if (mediaType.isCompatible(MULTIPART)) {
        log.debug(" - parse Multipart MIME ContentItem");
        //try to read ContentItem from "multipart/form-data"
        Graph metadata = null;
        FileItemIterator fileItemIterator;
        try {
            fileItemIterator = fu.getItemIterator(new MessageBodyReaderContext(entityStream, mediaType));
            while (fileItemIterator.hasNext()) {
                FileItemStream fis = fileItemIterator.next();
                //null-safe comparisons below: a part without a field name
                //must reach the explicit check in the last branch instead
                //of failing with a NullPointerException
                String fieldName = fis.getFieldName();
                if ("metadata".equals(fieldName)) {
                    if (contentItem != null) {
                        throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("The Multipart MIME part with the 'metadata' " + "MUST BE before the MIME part containing the " + "'content'!").build());
                    }
                    //only used if not parsed as query param
                    if (contentItemId == null && fis.getName() != null && !fis.getName().isEmpty()) {
                        contentItemId = new IRI(fis.getName());
                    }
                    metadata = new IndexedGraph();
                    try {
                        getParser().parse(metadata, fis.openStream(), fis.getContentType());
                    } catch (Exception e) {
                        throw new WebApplicationException(e, Response.status(Response.Status.BAD_REQUEST).entity(String.format("Unable to parse Metadata " + "from Multipart MIME part '%s' (" + "contentItem: %s| contentType: %s)", fieldName, fis.getName(), fis.getContentType())).build());
                    }
                } else if ("content".equals(fieldName)) {
                    contentItem = createContentItem(contentItemId, metadata, fis, parsedContentIds);
                } else if ("properties".equals(fieldName) || REQUEST_PROPERTIES_URI.getUnicodeString().equals(fieldName)) {
                    //parse the RequestProperties
                    if (contentItem == null) {
                        throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("Multipart MIME parts for " + "Request Properties MUST BE after the " + "MIME parts for 'metadata' AND 'content'").build());
                    }
                    MediaType propMediaType = MediaType.valueOf(fis.getContentType());
                    if (!APPLICATION_JSON_TYPE.isCompatible(propMediaType)) {
                        throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("Request Properties (Multipart MIME parts " + "with the name '" + fieldName + "') MUST " + "BE encoded as 'application/json' (encountered: '" + fis.getContentType() + "')!").build());
                    }
                    //fall back to UTF-8 if the part does not declare a charset
                    String propCharset = propMediaType.getParameters().get("charset");
                    if (propCharset == null) {
                        propCharset = "UTF-8";
                    }
                    Map<String, Object> reqProp = ContentItemHelper.initRequestPropertiesContentPart(contentItem);
                    try {
                        reqProp.putAll(toMap(new JSONObject(IOUtils.toString(fis.openStream(), propCharset))));
                    } catch (JSONException e) {
                        throw new WebApplicationException(e, Response.status(Response.Status.BAD_REQUEST).entity("Unable to parse Request Properties from " + "Multipart MIME parts with the name 'properties'!").build());
                    }
                } else {
                    //additional metadata as serialised RDF
                    if (contentItem == null) {
                        throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("Multipart MIME parts for additional " + "contentParts MUST BE after the MIME " + "parts for 'metadata' AND 'content'").build());
                    }
                    if (fieldName == null || fieldName.isEmpty()) {
                        throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("Multipart MIME parts representing " + "ContentParts for additional RDF metadata " + "MUST define the contentParts URI as " + "'name' of the MIME part!").build());
                    }
                    Graph graph = new IndexedGraph();
                    try {
                        getParser().parse(graph, fis.openStream(), fis.getContentType());
                    } catch (Exception e) {
                        throw new WebApplicationException(e, Response.status(Response.Status.BAD_REQUEST).entity(String.format("Unable to parse RDF " + "for ContentPart '%s' ( contentType: %s)", fis.getName(), fis.getContentType())).build());
                    }
                    //the field name of the MIME part is the URI of the content part
                    IRI contentPartId = new IRI(fieldName);
                    contentItem.addPart(contentPartId, graph);
                }
            }
            if (contentItem == null) {
                throw new WebApplicationException(Response.status(Response.Status.BAD_REQUEST).entity("The parsed multipart content item does not contain " + "any content. The content is expected to be contained " + "in a MIME part with the name 'content'. This part can " + "be also a 'multipart/alternate' if multiple content " + "parts need to be included in requests.").build());
            }
        } catch (FileUploadException e) {
            throw new WebApplicationException(e, Response.Status.BAD_REQUEST);
        }
    } else {
        //normal content
        ContentItemFactory ciFactory = getContentItemFactory();
        contentItem = ciFactory.createContentItem(contentItemId, new StreamSource(entityStream, mediaType.toString()));
        //add the URI of the main content
        parsedContentIds.add(contentItem.getPartUri(0).getUnicodeString());
    }
    //set the parsed contentIDs to the EnhancementProperties
    Map<String, Object> ep = ContentItemHelper.initRequestPropertiesContentPart(contentItem);
    parseEnhancementPropertiesFromParameters(ep);
    ep.put(PARSED_CONTENT_URIS, Collections.unmodifiableSet(parsedContentIds));
    //STANBOL-660: set the language of the content if explicitly parsed in the request
    String contentLanguage = getContentLanguage();
    if (!StringUtils.isBlank(contentLanguage)) {
        //language codes are case insensitive ... so we convert to lower case
        contentLanguage = contentLanguage.toLowerCase(Locale.ROOT);
        createParsedLanguageAnnotation(contentItem, contentLanguage);
        // previously only the dc:language property was set to the contentItem. However this
        // information is only used as fallback if no Language annotation is present. However
        // if a user explicitly parses the language he expects this language to be used
        // so this was changed with STANBOL-1417
        //            EnhancementEngineHelper.set(contentItem.getMetadata(), contentItem.getUri(),
        //                DC_LANGUAGE, new PlainLiteralImpl(contentLanguage));
    }
    return contentItem;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) ContentItemFactory(org.apache.stanbol.enhancer.servicesapi.ContentItemFactory) WebApplicationException(javax.ws.rs.WebApplicationException) StreamSource(org.apache.stanbol.enhancer.servicesapi.impl.StreamSource) JSONException(org.codehaus.jettison.json.JSONException) URISyntaxException(java.net.URISyntaxException) WebApplicationException(javax.ws.rs.WebApplicationException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) JSONException(org.codehaus.jettison.json.JSONException) FileUploadException(org.apache.commons.fileupload.FileUploadException) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) JSONObject(org.codehaus.jettison.json.JSONObject) ByteArrayInputStream(java.io.ByteArrayInputStream) FileItemStream(org.apache.commons.fileupload.FileItemStream) MediaType(javax.ws.rs.core.MediaType) List(java.util.List) ArrayList(java.util.ArrayList) JSONObject(org.codehaus.jettison.json.JSONObject) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) FileItemIterator(org.apache.commons.fileupload.FileItemIterator) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) FileUploadException(org.apache.commons.fileupload.FileUploadException) HashSet(java.util.HashSet)

Example 4 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testMp4.

/**
 * Checks the mappings for the Mp4 metadata extraction capabilities added
 * with Tika 1.1 (STANBOL-627): the extracted plain text and XHTML content
 * parts as well as the audio track metadata of {@code testMP4.m4a}.
 *
 * @throws EngineException
 * @throws IOException
 * @throws ParseException
 */
@Test
public void testMp4() throws EngineException, IOException, ParseException {
    log.info(">>> testMp4 <<<");
    ContentItem mp4Item = createContentItem("testMP4.m4a", "audio/mp4");
    assertFalse(engine.canEnhance(mp4Item) == CANNOT_ENHANCE);
    engine.computeEnhancements(mp4Item);

    // the plain text version has to contain title, artist and album
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(mp4Item, singleton("text/plain"));
    assertNotNull(textPart);
    Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob, "Test Title", "Test Artist", "Test Album");

    // an XHTML version has to be present as well
    Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(mp4Item, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    assertNotNull(xhtmlPart.getValue());

    // prefix shared by all media ontology properties checked below
    String mediaNs = String.valueOf(NamespaceEnum.media);

    // the audio track: first its types ...
    BlankNodeOrIRI track = verifyBlankNodeOrIRI(mp4Item, new IRI(mediaNs + "hasTrack"));
    verifyValues(mp4Item, track, RDF.type,
            new IRI(mediaNs + "MediaFragment"),
            new IRI(mediaNs + "Track"),
            new IRI(mediaNs + "AudioTrack"));
    // ... then its properties
    verifyValue(mp4Item, track, new IRI(mediaNs + "hasFormat"), XSD.string, "Stereo");
    verifyValue(mp4Item, track, new IRI(mediaNs + "samplingRate"), XSD.int_, "44100");
    verifyValue(mp4Item, track, new IRI(mediaNs + "hasCompression"), XSD.string, "M4A");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 5 with ContentItem

use of org.apache.stanbol.enhancer.servicesapi.ContentItem in project stanbol by apache.

the class TikaEngineTest method testExifMetadata.

/**
 * Enhances {@code testJPEG_EXIF.jpg} and checks the values extracted for
 * the Exif properties, their Media Ontology mappings and the mapped DC
 * properties.
 */
@Test
public void testExifMetadata() throws EngineException, ParseException, IOException {
    log.info(">>> testExifMetadata <<<");
    ContentItem jpeg = createContentItem("testJPEG_EXIF.jpg", "image/jpeg");
    assertFalse(engine.canEnhance(jpeg) == CANNOT_ENHANCE);
    engine.computeEnhancements(jpeg);

    // namespace prefixes of the three vocabularies checked below
    String nexifNs = "http://www.semanticdesktop.org/ontologies/2007/05/10/nexif#";
    String mediaNs = String.valueOf(NamespaceEnum.media);
    String dcNs = String.valueOf(NamespaceEnum.dc);

    // Exif properties
    verifyValue(jpeg, new IRI(nexifNs + "make"), null, "Canon");
    verifyValue(jpeg, new IRI(nexifNs + "software"), null, "Adobe Photoshop CS3 Macintosh");
    verifyValue(jpeg, new IRI(nexifNs + "dateTimeOriginal"), XSD.dateTime, "2009-08-11T09:09:45");
    verifyValue(jpeg, new IRI(nexifNs + "relatedImageWidth"), XSD.int_, "100");
    verifyValue(jpeg, new IRI(nexifNs + "fNumber"), XSD.double_, "5.6");
    verifyValue(jpeg, new IRI(nexifNs + "model"), null, "Canon EOS 40D");
    verifyValue(jpeg, new IRI(nexifNs + "isoSpeedRatings"), XSD.int_, "400");
    verifyValue(jpeg, new IRI(nexifNs + "xResolution"), XSD.double_, "240.0");
    verifyValue(jpeg, new IRI(nexifNs + "flash"), XSD.boolean_, "false");
    verifyValue(jpeg, new IRI(nexifNs + "exposureTime"), XSD.double_, "6.25E-4");
    verifyValue(jpeg, new IRI(nexifNs + "yResolution"), XSD.double_, "240.0");
    verifyValue(jpeg, new IRI(nexifNs + "resolutionUnit"), XSD.string, "Inch");
    verifyValue(jpeg, new IRI(nexifNs + "focalLength"), XSD.double_, "194.0");
    verifyValue(jpeg, new IRI(nexifNs + "relatedImageLength"), XSD.int_, "68");
    verifyValue(jpeg, new IRI(nexifNs + "bitsPerSample"), XSD.int_, "8");

    // Media Ontology mappings for Exif
    verifyValue(jpeg, new IRI(mediaNs + "frameHeight"), XSD.int_, "68");
    verifyValue(jpeg, new IRI(mediaNs + "frameWidth"), XSD.int_, "100");
    verifyValue(jpeg, new IRI(mediaNs + "hasFormat"), null, "image/jpeg");
    verifyValue(jpeg, new IRI(mediaNs + "creationDate"), XSD.dateTime, "2009-08-11T09:09:45");
    verifyValues(jpeg, new IRI(mediaNs + "hasKeyword"), null, "serbor", "moscow-birds", "canon-55-250");

    // mapped DC properties
    verifyValue(jpeg, new IRI(dcNs + "format"), null, "image/jpeg");
    verifyValue(jpeg, new IRI(dcNs + "created"), XSD.dateTime, "2009-08-11T09:09:45");
    verifyValue(jpeg, new IRI(dcNs + "modified"), XSD.dateTime, "2009-10-02T23:02:49");
    verifyValues(jpeg, new IRI(dcNs + "subject"), null, "serbor", "moscow-birds", "canon-55-250");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)73 Test (org.junit.Test)62 IRI (org.apache.clerezza.commons.rdf.IRI)46 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)18 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)18 HashMap (java.util.HashMap)15 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)13 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)12 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 Graph (org.apache.clerezza.commons.rdf.Graph)8 Date (java.util.Date)6 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)6 Hashtable (java.util.Hashtable)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 IOException (java.io.IOException)3 InputStream (java.io.InputStream)3 MediaType (javax.ws.rs.core.MediaType)3 Triple (org.apache.clerezza.commons.rdf.Triple)3