use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.
the class TikaCLI method handleRecursiveJson.
/**
 * Parses the resource at {@code url} through a {@link RecursiveParserWrapper}
 * and writes the resulting list of metadata objects to {@code output} as JSON.
 * <p>
 * The content handler factory is obtained from the currently configured
 * {@code type}; pretty printing and the output encoding follow the
 * {@code prettyPrint} and {@code encoding} fields.
 *
 * @param url    location of the document to parse
 * @param output stream that receives the JSON; it is flushed but not closed here
 * @throws IOException   if reading the input or writing the output fails
 * @throws SAXException  if the parse reports a SAX error
 * @throws TikaException if the parse fails
 */
private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
    Metadata rootMetadata = new Metadata();
    RecursiveParserWrapper recursiveParser =
            new RecursiveParserWrapper(parser, getContentHandlerFactory(type));

    // The wrapper collects the per-document metadata itself, so no handler is passed.
    try (InputStream stream = TikaInputStream.get(url, rootMetadata)) {
        recursiveParser.parse(stream, null, rootMetadata, context);
    }

    JsonMetadataList.setPrettyPrinting(prettyPrint);
    Writer jsonWriter = getOutputWriter(output, encoding);
    try {
        JsonMetadataList.toJson(recursiveParser.getMetadata(), jsonWriter);
    } finally {
        // Flush even when serialization fails so partial output is not left buffered.
        jsonWriter.flush();
    }
}
use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.
the class TikaGUI method handleStream.
/**
 * Parses {@code input} and fills the GUI views with the results.
 * <p>
 * The stream is wrapped in a {@link ProgressMonitorInputStream} and parsed once
 * with a tee handler feeding the HTML, text, main-text and XML views; the
 * metadata view lists every name/value pair sorted by name. If the stream
 * supports mark/reset it is then rewound and parsed a second time through a
 * {@link RecursiveParserWrapper} to populate the JSON view. The "metadata"
 * card is shown in every case.
 *
 * @param input stream to parse; it is wrapped and re-assigned locally
 * @param md    metadata object handed to the first parse and rendered afterwards
 * @throws Exception if either parse or the JSON serialization fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler = new TeeContentHandler(
            getHtmlHandler(htmlBuffer),
            getTextContentHandler(textBuffer),
            getTextMainContentHandler(textMainBuffer),
            getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());

    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        // Mark the start of the stream so it can be rewound for the recursive parse below.
        int mark = -1;
        if (input instanceof TikaInputStream) {
            TikaInputStream tis = (TikaInputStream) input;
            if (tis.hasFile()) {
                // The length is clamped rather than blindly narrowed to int: an
                // unchecked cast wraps around for files of 2 GiB or more and would
                // hand mark() a negative or far-too-small read limit.
                long length = tis.getLength();
                if (length >= 0) {
                    mark = (int) Math.min(length, (long) Integer.MAX_VALUE);
                }
            }
        }
        if (mark == -1) {
            // No usable length available: fall back to the GUI's fixed limit.
            mark = MAX_MARK;
        }
        input.mark(mark);
    }

    parser.parse(input, handler, md, context);

    // Render the metadata as "name: value" lines, sorted by name.
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }

    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }

    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }

    try {
        input.reset();
    } catch (IOException e) {
        // The first parse read past the mark limit; report it in the JSON view
        // instead of failing the whole operation.
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
        layout.show(cards, "metadata");
        return;
    }

    // Second pass: the recursive parse runs with a fresh Metadata and ParseContext.
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
    wrapper.parse(input, null, new Metadata(), new ParseContext());
    StringWriter jsonBuffer = new StringWriter();
    JsonMetadataList.setPrettyPrinting(true);
    JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
    setText(json, jsonBuffer.toString());

    layout.show(cards, "metadata");
}
use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.
the class RTFParserTest method testRegularImages.
// TIKA-1010: test regular (not "embedded") images/picts.
// Parses testRTFRegularImages.rtf recursively and checks the metadata
// collected for two of the images it contains.
@Test
public void testRegularImages() throws Exception {
    Metadata rootMetadata = new Metadata();
    rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");

    Parser autoDetect = new AutoDetectParser();
    ParseContext context = new ParseContext();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
            autoDetect,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
    ContentHandler bodyHandler = new BodyContentHandler();

    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
        wrapper.parse(stream, bodyHandler, rootMetadata, context);
    }

    List<Metadata> metadataList = wrapper.getMetadata();
    // entry 1 is expected to be testJPEG_EXIF_普林斯顿.jpg
    Metadata jpgExifMetadata = metadataList.get(1);
    // entry 3 is expected to be testJPEG_普林斯顿.jpg
    Metadata jpgMetadata = metadataList.get(3);
    assertTrue(jpgExifMetadata != null);
    assertTrue(jpgMetadata != null);

    List<String> exifSubjects = Arrays.asList(jpgExifMetadata.getValues("dc:subject"));
    assertTrue(exifSubjects.contains("serbor"));
    String jpgComments = jpgMetadata.get("Comments");
    assertTrue(jpgComments.contains("Licensed to the Apache"));

    // metadata from one embedded object must not leak into the next
    List<String> jpgSubjects = Arrays.asList(jpgMetadata.getValues("dc:subject"));
    assertFalse(jpgSubjects.contains("serbor"));

    assertEquals("false", jpgMetadata.get(RTFMetadata.THUMBNAIL));
    assertEquals("false", jpgExifMetadata.get(RTFMetadata.THUMBNAIL));

    assertEquals(49, jpgMetadata.names().length);
    assertEquals(113, jpgExifMetadata.names().length);
}
use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.
the class RecursiveMetadataResource method parseMetadata.
/**
 * Parses the request body with a {@link RecursiveParserWrapper} and returns
 * the metadata collected by the wrapper as a {@link MetadataList}.
 *
 * @param is              request body to parse
 * @param httpHeaders     request headers, used to fill the metadata and parse context
 * @param info            request URI information; its path is passed to the parse call
 * @param handlerTypeName name of the content handler type; resolved via
 *                        {@code BasicContentHandlerFactory.parseHandlerType} with
 *                        {@code DEFAULT_HANDLER_TYPE} as the fallback argument
 * @return the metadata list gathered by the recursive wrapper
 * @throws Exception if parsing fails
 */
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName) throws Exception {
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    Parser parser = TikaResource.createParser();

    // TODO: parameterize choice of max chars/max embedded attachments
    BasicContentHandlerFactory.HANDLER_TYPE handlerType =
            BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
    RecursiveParserWrapper wrapper =
            new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(handlerType, -1));

    TikaResource.fillMetadata(parser, metadata, context, httpHeaders);
    // no need to add parser to parse recursively
    TikaResource.fillParseContext(context, httpHeaders, null);
    TikaResource.logRequest(LOG, info, metadata);

    String requestPath = info.getPath();
    // Records the detected language on the root metadata once the document ends.
    LanguageHandler languageHandler = new LanguageHandler() {
        @Override
        public void endDocument() {
            metadata.set("language", getLanguage().getLanguage());
        }
    };
    TikaResource.parse(wrapper, LOG, requestPath, is, languageHandler, metadata, context);

    return new MetadataList(wrapper.getMetadata());
}
use of org.apache.tika.parser.RecursiveParserWrapper in project tika by apache.
the class ParsingExample method recursiveParserWrapperExample.
/**
 * Demonstrates the {@link org.apache.tika.parser.RecursiveParserWrapper}.
 * <p>
 * When a document may contain embedded documents, it can be useful to obtain
 * a list of metadata objects: one for the container document and one for
 * each embedded document. This gives easy access to both the extracted
 * content and the metadata of every embedded document. Many formats can
 * carry embedded documents -- traditional containers such as zip and tar,
 * but also common office formats including MSWord, MSExcel, MSPowerPoint,
 * RTF, PDF, MSG and several others.
 * <p>
 * The format of the "content" is determined by the ContentHandlerFactory,
 * and the content is stored under
 * {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}.
 * <p>
 * The drawback of the RecursiveParserWrapper is that it caches metadata and
 * contents in memory, so it should not be used on files whose contents are
 * too big to be handled in memory.
 *
 * @return a list of metadata objects, one for the container file and one for
 *         each embedded file
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if the parse reports a SAX error
 * @throws TikaException if the parse fails
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    Metadata rootMetadata = new Metadata();
    rootMetadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    ParseContext parseContext = new ParseContext();

    Parser autoDetectParser = new AutoDetectParser();
    ContentHandlerFactory handlerFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    RecursiveParserWrapper recursiveParser =
            new RecursiveParserWrapper(autoDetectParser, handlerFactory);

    try (InputStream docStream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        recursiveParser.parse(docStream, new DefaultHandler(), rootMetadata, parseContext);
    }

    return recursiveParser.getMetadata();
}
Aggregations