use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class JackcessParserTest method testBasic.
/**
 * Parses each sample Access database through a {@link RecursiveParserWrapper}
 * producing XML content, then checks the number of metadata objects and a
 * handful of strings expected in the main document and the last embedded one.
 */
@Test
public void testBasic() throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper w = new RecursiveParserWrapper(p,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    for (String fName : new String[] { "testAccess2.accdb", "testAccess2_2000.mdb", "testAccess2_2002-2003.mdb" }) {
        // try-with-resources closes the stream even when parsing throws,
        // and no longer hides a failure to close it.
        try (InputStream is = this.getResourceAsStream("/test-documents/" + fName)) {
            Metadata meta = new Metadata();
            ParseContext c = new ParseContext();
            w.parse(is, new DefaultHandler(), meta, c);
        }
        List<Metadata> list = w.getMetadata();
        assertEquals(4, list.size());
        String mainContent = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        //make sure there's a thead and tbody
        assertContains("</thead><tbody>", mainContent);
        //assert table header
        assertContains("<th>ShortTextField</th>", mainContent);
        //test date format
        assertContains("6/24/15", mainContent);
        //test that markup is stripped
        assertContains("over the bold italic dog", mainContent);
        //test unicode
        assertContains("普林斯顿大学", mainContent);
        //test embedded document handling
        assertContains("Test Document with embedded pdf", list.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
        // the wrapper is shared across files, so clear its collected metadata before the next one
        w.reset();
    }
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class BasicTikaFSConsumersBuilder method getOutputStreamFactory.
/**
 * Builds the {@link FSOutputStreamFactory} described by the given config node.
 * <p>
 * Reads the "outputDir", "compression" and "outputSuffix" attributes. When no
 * suffix is configured, one is generated from the handler type (or "json" when
 * the recursive parser wrapper is in use) followed by the compression suffix.
 * <p>
 * NOTE: the "handleExisting" attribute is not read here. An earlier mapping of
 * overwrite/rename/skip is disabled and {@code SKIP} is always used.
 */
private OutputStreamFactory getOutputStreamFactory(Node node, Map<String, String> runtimeAttributes, ContentHandlerFactory contentHandlerFactory, boolean useRecursiveParserWrapper) {
    Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
    Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);

    // Default is no compression; the first matching substring wins, so "bz" is tested before "gz" and "zip".
    FSOutputStreamFactory.COMPRESSION compression = FSOutputStreamFactory.COMPRESSION.NONE;
    String requestedCompression = attrs.get("compression");
    if (requestedCompression != null) {
        if (requestedCompression.contains("bz")) {
            compression = FSOutputStreamFactory.COMPRESSION.BZIP2;
        } else if (requestedCompression.contains("gz")) {
            compression = FSOutputStreamFactory.COMPRESSION.GZIP;
        } else if (requestedCompression.contains("zip")) {
            compression = FSOutputStreamFactory.COMPRESSION.ZIP;
        }
    }

    //suffix should not start with "."
    String suffix = attrs.get("outputSuffix");
    if (suffix == null) {
        StringBuilder generated = new StringBuilder();
        if (useRecursiveParserWrapper) {
            generated.append("json");
        } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
            BasicContentHandlerFactory basicFactory = (BasicContentHandlerFactory) contentHandlerFactory;
            appendSuffix(basicFactory.getType(), generated);
        }
        appendCompression(compression, generated);
        suffix = generated.toString();
    }

    //if the driver restarts and this is set to overwrite...
    return new FSOutputStreamFactory(outputDir, FSUtil.HANDLE_EXISTING.SKIP, compression, suffix);
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class TikaGUI method handleStream.
/**
 * Parses the given stream and displays the results in the GUI's views.
 * <p>
 * The stream is parsed once into the HTML, text, main-text and XML buffers
 * via a {@link TeeContentHandler}. If the stream supports mark/reset, it is
 * then reset and parsed a second time with a {@link RecursiveParserWrapper}
 * to fill the JSON view.
 *
 * @param input stream to parse; wrapped in a progress-monitoring {@link TikaInputStream}
 * @param md    metadata object passed to the first parse and rendered in the metadata view
 * @throws Exception if either parse fails
 */
private void handleStream(InputStream input, Metadata md) throws Exception {
    StringWriter htmlBuffer = new StringWriter();
    StringWriter textBuffer = new StringWriter();
    StringWriter textMainBuffer = new StringWriter();
    StringWriter xmlBuffer = new StringWriter();
    StringBuilder metadataBuffer = new StringBuilder();
    ContentHandler handler = new TeeContentHandler(getHtmlHandler(htmlBuffer), getTextContentHandler(textBuffer), getTextMainContentHandler(textMainBuffer), getXmlContentHandler(xmlBuffer));
    context.set(DocumentSelector.class, new ImageDocumentSelector());
    input = TikaInputStream.get(new ProgressMonitorInputStream(this, "Parsing stream", input));
    if (input.markSupported()) {
        int mark = -1;
        if (input instanceof TikaInputStream) {
            TikaInputStream tis = (TikaInputStream) input;
            if (tis.hasFile()) {
                // Clamp before narrowing: a plain (int) cast of a length above
                // Integer.MAX_VALUE would wrap to a negative or bogus mark limit.
                mark = (int) Math.min(tis.getLength(), (long) Integer.MAX_VALUE);
            }
        }
        if (mark == -1) {
            // no backing file (or unknown length): fall back to the GUI's fixed limit
            mark = MAX_MARK;
        }
        input.mark(mark);
    }
    parser.parse(input, handler, md, context);

    // Render the metadata as "name: value" lines, sorted by name.
    String[] names = md.names();
    Arrays.sort(names);
    for (String name : names) {
        for (String val : md.getValues(name)) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(val);
            metadataBuffer.append("\n");
        }
    }
    String name = md.get(Metadata.RESOURCE_NAME_KEY);
    if (name != null && name.length() > 0) {
        setTitle("Apache Tika: " + name);
    } else {
        setTitle("Apache Tika: unnamed document");
    }
    setText(metadata, metadataBuffer.toString());
    setText(xml, xmlBuffer.toString());
    setText(text, textBuffer.toString());
    setText(textMain, textMainBuffer.toString());
    setText(html, htmlBuffer.toString());

    // The recursive (JSON) view needs a second pass over the same bytes.
    if (!input.markSupported()) {
        setText(json, "InputStream does not support mark/reset for Recursive Parsing");
        layout.show(cards, "metadata");
        return;
    }
    boolean isReset = false;
    try {
        input.reset();
        isReset = true;
    } catch (IOException e) {
        // Reset failed; report it in the JSON view instead of aborting, since the other views are already filled.
        setText(json, "Error during stream reset.\n" + "There's a limit of " + MAX_MARK + " bytes for this type of processing in the GUI.\n" + "Try the app with command line argument of -J.");
    }
    if (isReset) {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        // NOTE(review): the handler argument is null here; presumably the wrapper
        // builds its own handlers from the factory and ignores it — confirm.
        wrapper.parse(input, null, new Metadata(), new ParseContext());
        StringWriter jsonBuffer = new StringWriter();
        JsonMetadataList.setPrettyPrinting(true);
        JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
        setText(json, jsonBuffer.toString());
    }
    layout.show(cards, "metadata");
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class RTFParserTest method testRegularImages.
//TIKA-1010 test regular (not "embedded") images/picts
/**
 * Parses testRTFRegularImages.rtf recursively and checks the metadata
 * collected for two of the images it contains.
 */
@Test
public void testRegularImages() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    BasicContentHandlerFactory ignoreFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect, ignoreFactory);
    ContentHandler bodyHandler = new BodyContentHandler();

    Metadata rootMetadata = new Metadata();
    rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");

    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
        recursiveParser.parse(stream, bodyHandler, rootMetadata, parseContext);
    }

    List<Metadata> collected = recursiveParser.getMetadata();
    //("testJPEG_EXIF_普林斯顿.jpg");
    Metadata jpgExifMeta = collected.get(1);
    //("testJPEG_普林斯顿.jpg");
    Metadata jpgMeta = collected.get(3);
    assertTrue(jpgExifMeta != null);
    assertTrue(jpgMeta != null);

    List<String> exifSubjects = Arrays.asList(jpgExifMeta.getValues("dc:subject"));
    assertTrue(exifSubjects.contains("serbor"));
    assertTrue(jpgMeta.get("Comments").contains("Licensed to the Apache"));

    //make sure old metadata doesn't linger between objects
    List<String> plainSubjects = Arrays.asList(jpgMeta.getValues("dc:subject"));
    assertFalse(plainSubjects.contains("serbor"));

    assertEquals("false", jpgMeta.get(RTFMetadata.THUMBNAIL));
    assertEquals("false", jpgExifMeta.get(RTFMetadata.THUMBNAIL));
    assertEquals(49, jpgMeta.names().length);
    assertEquals(113, jpgExifMeta.names().length);
}
use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.
the class RecursiveMetadataResource method parseMetadata.
/**
 * Parses the request body with a {@link RecursiveParserWrapper} and returns
 * the metadata list the wrapper collected.
 *
 * @param is              request body to parse
 * @param httpHeaders     request headers, used to fill the metadata and parse context
 * @param info            request URI info, used for logging and as the parse path
 * @param handlerTypeName name of the content handler type; resolved against DEFAULT_HANDLER_TYPE
 * @return the metadata collected by the wrapper, wrapped in a {@link MetadataList}
 * @throws Exception if parsing fails
 */
private MetadataList parseMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName) throws Exception {
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    Parser baseParser = TikaResource.createParser();

    // TODO: parameterize choice of max chars/max embedded attachments
    BasicContentHandlerFactory.HANDLER_TYPE handlerType =
            BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
    BasicContentHandlerFactory handlerFactory = new BasicContentHandlerFactory(handlerType, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(baseParser, handlerFactory);

    TikaResource.fillMetadata(baseParser, metadata, context, httpHeaders);
    // no need to add parser to parse recursively
    TikaResource.fillParseContext(context, httpHeaders, null);
    TikaResource.logRequest(LOG, info, metadata);

    TikaResource.parse(recursiveParser, LOG, info.getPath(), is, new LanguageHandler() {
        // At end of document, store the handler's detected language under the "language" key.
        public void endDocument() {
            metadata.set("language", getLanguage().getLanguage());
        }
    }, metadata, context);

    return new MetadataList(recursiveParser.getMetadata());
}
Aggregations