use of org.apache.tika.metadata.Metadata in project tika by apache.
the class RecursiveMetadataResourceTest method testHandlerType.
/**
 * Checks the content returned in the recursive-metadata response for each
 * handler-type path suffix.
 * <p>
 * The default path, the path with a trailing slash, the "unparseable" path and the
 * xml path are all expected to yield content starting with an XHTML root element;
 * the text path is expected to yield content starting with {@code embed_3}; the
 * ignore path is expected to yield no content value at all.
 */
@Test
public void testHandlerType() throws Exception {
    final String xhtmlPrefix = "<html xmlns=\"http://www.w3.org/1999/xhtml\">";

    //default unspecified
    assertTrue(fetchEmbeddedContent(endPoint + META_PATH).trim().startsWith(xhtmlPrefix));
    //extra slash
    assertTrue(fetchEmbeddedContent(endPoint + META_PATH + SLASH).trim().startsWith(xhtmlPrefix));
    //unparseable
    assertTrue(fetchEmbeddedContent(endPoint + META_PATH + UNPARSEABLE_PATH).trim().startsWith(xhtmlPrefix));
    //xml
    assertTrue(fetchEmbeddedContent(endPoint + META_PATH + XML_PATH).trim().startsWith(xhtmlPrefix));
    //text
    assertTrue(fetchEmbeddedContent(endPoint + META_PATH + TEXT_PATH).trim().startsWith("embed_3"));
    //ignore
    assertNull(fetchEmbeddedContent(endPoint + META_PATH + IGNORE_PATH));
}

/**
 * PUTs {@code TEST_RECURSIVE_DOC} to the given URL, parses the JSON response into a
 * metadata list, asserts that the list has 12 entries and returns the
 * {@code RecursiveParserWrapper.TIKA_CONTENT} value of the entry at index 6.
 * <p>
 * Both the uploaded resource stream and the response reader are closed before
 * returning, so repeated calls do not accumulate open streams.
 *
 * @param url full request URL (end point plus path)
 * @return the untrimmed content value of entry 6, or {@code null} if it has none
 */
private String fetchEmbeddedContent(String url) throws Exception {
    final int expectedEntries = 12;
    final int inspectedIndex = 6;
    List<Metadata> metadataList;
    // A null resource is tolerated by try-with-resources, so a missing test
    // document still reaches put(...) exactly as it did before.
    try (InputStream doc = ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)) {
        Response response = WebClient.create(url).accept("application/json").put(doc);
        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
            metadataList = JsonMetadataList.fromJson(reader);
        }
    }
    assertEquals(expectedEntries, metadataList.size());
    return metadataList.get(inspectedIndex).get(RecursiveParserWrapper.TIKA_CONTENT);
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class RecursiveMetadataResourceTest method testSimpleWord.
/**
 * PUTs {@code TEST_RECURSIVE_DOC} to the recursive-metadata end point and spot-checks
 * the parsed response: the number of entries, the application name on the first
 * entry, a phrase in the content of entry 6 and the MD5 digest of entry 10.
 */
@Test
public void testSimpleWord() throws Exception {
    List<Metadata> metadataList;
    // Close both the uploaded document stream and the response reader once the
    // JSON has been parsed; the original left them open.
    try (InputStream doc = ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)) {
        Response response = WebClient.create(endPoint + META_PATH).accept("application/json").put(doc);
        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
            metadataList = JsonMetadataList.fromJson(reader);
        }
    }
    assertEquals(12, metadataList.size());
    assertEquals("Microsoft Office Word", metadataList.get(0).get("Application-Name"));
    assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
    assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5"));
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method testIgnoreCharsetDetectorLanguage.
/**
 * Regression test for TIKA-339: a content language already present in the metadata
 * must still be there, unchanged, after the HTML parser has run.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
 */
@Test
public void testIgnoreCharsetDetectorLanguage() throws Exception {
    // Pre-populate the language so the assertion below can detect any overwrite.
    Metadata seeded = new Metadata();
    seeded.add(Metadata.CONTENT_LANGUAGE, "en");

    String html = "<html><title>Simple Content</title><body></body></html>";
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));

    HtmlParser parser = new HtmlParser();
    parser.parse(input, new BodyContentHandler(), seeded, new ParseContext());

    String languageAfterParse = seeded.get(Metadata.CONTENT_LANGUAGE);
    assertEquals("en", languageAfterParse);
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method testImgUrlExtraction.
/**
 * Regression test for TIKA-463: elements that carry URLs must not be skipped.
 * <p>
 * Parses a document with a {@code <base href>} and a relative {@code <img src>} and
 * checks that the output written by the transformer handler contains the image URL
 * resolved against the base.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testImgUrlExtraction() throws Exception {
    final String html = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><body><img src=\"image.jpg\" /></body></html>";

    // makeHtmlTransformer is defined elsewhere in this class; presumably it
    // serialises the parser's output into the supplied writer.
    StringWriter output = new StringWriter();
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    HtmlParser parser = new HtmlParser();
    parser.parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // <img> tag should exist, with fully resolved URL
    Pattern resolvedSrc = Pattern.compile("(?s).*src=\"http://domain.com/image.jpg\".*$");
    boolean found = resolvedSrc.matcher(output.toString()).matches();
    assertTrue(found);
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class SXSLFExtractorTest method testMacrosInPptm.
/**
 * Exercises macro extraction from {@code testPPT_macros.pptm} in three phases:
 * <ol>
 *   <li>with the class-level {@code parseContext}, no entry may have the
 *       {@code text/x-vbasic} content type;</li>
 *   <li>with an {@code OfficeParserConfig} that enables macro extraction and the SAX
 *       pptx extractor, an entry with the expected macro content, content type and
 *       embedded-resource type must be present;</li>
 *   <li>the same expectation must hold when the parser is configured from
 *       {@code tika-config-sax-macros.xml}.</li>
 * </ol>
 * In every phase the results must also contain the expected {@code X-Parsed-By} value.
 */
@Test
public void testMacrosInPptm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
    List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext);
    //test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // Constant-first comparison: an entry without a content type must not
        // abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    // Minimum set of values that one entry of the result list has to carry.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Aggregations