Search in sources:

Example 91 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveMetadataResourceTest method testHandlerType.

/**
 * Checks how the handler-type path segment of the recursive metadata endpoint
 * affects the content stored under {@code RecursiveParserWrapper.TIKA_CONTENT}.
 *
 * <p>For every variant the response must contain 12 metadata entries. The entry
 * at index 6 is inspected: the default, extra-slash, unparseable and xml variants
 * must yield content starting with the XHTML root element, the text variant must
 * yield content starting with {@code embed_3}, and the ignore variant must yield
 * no content at all.
 *
 * @throws Exception if a request or the JSON parsing fails
 */
@Test
public void testHandlerType() throws Exception {
    final String xhtmlPrefix = "<html xmlns=\"http://www.w3.org/1999/xhtml\">";

    // default unspecified, extra slash, unparseable, xml: all expected to return XHTML
    String[] xhtmlUrls = {
        endPoint + META_PATH,
        endPoint + META_PATH + SLASH,
        endPoint + META_PATH + UNPARSEABLE_PATH,
        endPoint + META_PATH + XML_PATH
    };
    for (String url : xhtmlUrls) {
        List<Metadata> metadataList = putAndParseMetadataList(url);
        // the URL is used as the assertion message so a failure names the variant
        assertEquals(url, 12, metadataList.size());
        String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
        assertTrue(url, content.startsWith(xhtmlPrefix));
    }

    //text
    String textUrl = endPoint + META_PATH + TEXT_PATH;
    List<Metadata> textList = putAndParseMetadataList(textUrl);
    assertEquals(textUrl, 12, textList.size());
    String textContent = textList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
    assertTrue(textUrl, textContent.startsWith("embed_3"));

    //ignore
    String ignoreUrl = endPoint + META_PATH + IGNORE_PATH;
    List<Metadata> ignoreList = putAndParseMetadataList(ignoreUrl);
    assertEquals(ignoreUrl, 12, ignoreList.size());
    assertNull(ignoreUrl, ignoreList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
}

/**
 * PUTs the {@code TEST_RECURSIVE_DOC} classpath resource to {@code url}, asking for
 * JSON, and parses the response body into a metadata list.
 *
 * <p>Both the request stream and the response reader are closed before returning.
 *
 * @param url full URL of the endpoint variant to call
 * @return the metadata list parsed from the JSON response
 * @throws Exception if the request or the JSON parsing fails
 */
private List<Metadata> putAndParseMetadataList(String url) throws Exception {
    try (InputStream doc = ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)) {
        Response response = WebClient.create(url).accept("application/json").put(doc);
        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
            return JsonMetadataList.fromJson(reader);
        }
    }
}
Also used : Response(javax.ws.rs.core.Response) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) Test(org.junit.Test)

Example 92 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveMetadataResourceTest method testSimpleWord.

/**
 * PUTs the recursive test document to the metadata endpoint and spot-checks the
 * returned list: 12 entries in total, the application name on the first entry,
 * a phrase in the content of entry 6, and the MD5 digest of entry 10.
 *
 * @throws Exception if the request or the JSON parsing fails
 */
@Test
public void testSimpleWord() throws Exception {
    Response response = WebClient.create(endPoint + META_PATH).accept("application/json").put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
    List<Metadata> metadataList;
    // close the reader (and the underlying entity stream) once the JSON has been parsed
    try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
        metadataList = JsonMetadataList.fromJson(reader);
    }
    assertEquals(12, metadataList.size());
    assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
    assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
    assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
}
Also used : Response(javax.ws.rs.core.Response) InputStreamReader(java.io.InputStreamReader) Metadata(org.apache.tika.metadata.Metadata) Reader(java.io.Reader) Test(org.junit.Test)

Example 93 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method testIgnoreCharsetDetectorLanguage.

/**
 * Regression test for TIKA-339: the language reported by the charset detector
 * must not be used. A content language of "en" is set on the metadata before
 * parsing and must still be "en" afterwards.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
 */
@Test
public void testIgnoreCharsetDetectorLanguage() throws Exception {
    final Metadata parsed = new Metadata();
    parsed.add(Metadata.CONTENT_LANGUAGE, "en");

    final byte[] html = "<html><title>Simple Content</title><body></body></html>".getBytes(UTF_8);
    final HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(new ByteArrayInputStream(html), new BodyContentHandler(), parsed, new ParseContext());

    assertEquals("en", parsed.get(Metadata.CONTENT_LANGUAGE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 94 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class HtmlParserTest method testImgUrlExtraction.

/**
 * Regression test for TIKA-463: elements that carry URLs must not be skipped.
 * Parses a document with a {@code <base>} href and a relative {@code <img>} source,
 * then checks that the serialized output contains the image URL resolved against
 * the base.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testImgUrlExtraction() throws Exception {
    final String html = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><body><img src=\"image.jpg\" /></body></html>";
    final StringWriter output = new StringWriter();
    final ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));

    new HtmlParser().parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // the <img> tag should be present with its src fully resolved
    final boolean hasResolvedSrc =
            Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", output.toString());
    assertTrue(hasResolvedSrc);
}
Also used : StringWriter(java.io.StringWriter) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 95 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class SXSLFExtractorTest method testMacrosInPptm.

/**
 * Exercises macro extraction for {@code testPPT_macros.pptm} in three configurations:
 * <ol>
 *   <li>with the {@code parseContext} field: no entry may have the VBA content type;</li>
 *   <li>with a {@link ParseContext} whose {@link OfficeParserConfig} enables macro
 *       extraction and the SAX pptx extractor: the expected macro metadata must be
 *       present;</li>
 *   <li>with a parser built from {@code tika-config-sax-macros.xml}: the same macro
 *       metadata must be present.</li>
 * </ol>
 * In every configuration the list must contain an entry parsed by
 * {@code XSLFEventBasedPowerPointExtractor}.
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacrosInPptm() throws Exception {
    final String testFile = "testPPT_macros.pptm";
    final String vbaContentType = "text/x-vbasic";

    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
    List<Metadata> metadataList = getRecursiveMetadata(testFile, parseContext);
    //test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // constant-first equals: an entry without a content type must not raise an NPE
        if (vbaContentType.equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);
    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, vbaContentType);
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    metadataList = getRecursiveMetadata(testFile, context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata(testFile, parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Metadata (org.apache.tika.metadata.Metadata) 651, Test (org.junit.Test) 467, InputStream (java.io.InputStream) 320, ParseContext (org.apache.tika.parser.ParseContext) 283, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 269, TikaTest (org.apache.tika.TikaTest) 257, ContentHandler (org.xml.sax.ContentHandler) 229, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 154, ByteArrayInputStream (java.io.ByteArrayInputStream) 143, Parser (org.apache.tika.parser.Parser) 136, TikaInputStream (org.apache.tika.io.TikaInputStream) 133, IOException (java.io.IOException) 66, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 59, TikaException (org.apache.tika.exception.TikaException) 48, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 36, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 36, StringWriter (java.io.StringWriter) 33, Tika (org.apache.tika.Tika) 29, MediaType (org.apache.tika.mime.MediaType) 29, SAXException (org.xml.sax.SAXException) 29