use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class ForkParserTest method testPoolSizeReached.
/**
 * Checks that a parse request made while {@code parser.getPoolSize()} other
 * parses are still in progress yields no output until those parses finish.
 *
 * <p>The test starts one background parse per pool slot, each reading from a
 * pipe that is kept open, then starts one more parse on an empty stream and
 * asserts that its handler stays empty until the pipes are closed.
 *
 * <p>NOTE(review): exceptions raised inside the helper threads are only printed,
 * so a failure there surfaces indirectly through the final assertions.
 */
@Test
public void testPoolSizeReached() throws Exception {
final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
try {
// Starts with zero permits; helper threads release permits to signal progress
// and this test thread acquires them to wait for that progress.
final Semaphore barrier = new Semaphore(0);
// One background thread (and one pipe feeding it) per pool slot.
Thread[] threads = new Thread[parser.getPoolSize()];
PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
final ParseContext context = new ParseContext();
for (int i = 0; i < threads.length; i++) {
// Input whose single-byte read() releases a permit before delegating, so the
// test can observe that something has begun reading from this pipe.
final PipedInputStream input = new PipedInputStream() {
@Override
public synchronized int read() throws IOException {
barrier.release();
return super.read();
}
};
// The write end is never written to; it is only closed later to end the parse.
pipes[i] = new PipedOutputStream(input);
threads[i] = new Thread() {
public void run() {
try {
// Output of the background parses is discarded.
ContentHandler o = new DefaultHandler();
parser.parse(input, o, new Metadata(), context);
} catch (Exception e) {
e.printStackTrace();
}
}
};
threads[i].start();
}
// Wait until all the background parsers have been started
// (one permit per pool slot, released from the overridden read() above).
barrier.acquire(parser.getPoolSize());
// Handler of the extra parse; its text is inspected before and after the
// background parses are allowed to finish.
final ContentHandler o = new BodyContentHandler();
Thread blocked = new Thread() {
public void run() {
try {
// Signal that this thread is running before attempting the parse.
barrier.release();
InputStream stream = new ByteArrayInputStream(new byte[0]);
parser.parse(stream, o, new Metadata(), context);
} catch (Exception e) {
e.printStackTrace();
}
}
};
blocked.start();
// Wait until the last thread is started, and then some to
// make sure that it would have had a chance to start processing
// data had it not been blocked.
barrier.acquire();
Thread.sleep(1000);
// Nothing may have been written to the handler while the other parses are open.
assertEquals("", o.toString());
// Closing each pipe ends the corresponding background read; join waits for
// that thread to finish.
for (int i = 0; i < threads.length; i++) {
pipes[i].close();
threads[i].join();
}
// Once the background parses are done the extra parse is expected to complete
// and produce the test parser's output.
blocked.join();
assertEquals("Hello, World!", o.toString().trim());
} finally {
parser.close();
}
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class RecursiveParserWrapperFSConsumer method processFileResource.
/**
 * Parses a single file resource through a {@link RecursiveParserWrapper} and
 * writes the resulting list of {@link Metadata} objects as JSON to the output
 * stream obtained for that resource.
 *
 * <p>If the parse throws, the filtered stack trace is added to the first
 * metadata item (or to the container metadata when the list is empty) and the
 * list is still written out.
 *
 * @param fileResource the resource to parse
 * @return {@code true} if the parse completed without throwing; {@code false}
 *         if the output or input stream could not be opened, or if the parse
 *         threw something other than an {@link Error}
 * @throws RuntimeException if writing the JSON output fails
 * @throws Error if the parse threw an {@code Error}; it is rethrown after the
 *         metadata has been written
 */
@Override
public boolean processFileResource(FileResource fileResource) {
    Parser wrapped = parserFactory.getParser(tikaConfig);
    RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory);
    ParseContext context = new ParseContext();
    // Register the wrapper as the context's Parser
    // (presumably so embedded documents are handled by it as well).
    context.set(Parser.class, parser);

    // Try to open the output stream first.
    OutputStream os = getOutputStream(fsOSFactory, fileResource);
    if (os == null) {
        LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH));
        return false;
    }

    // Open the input stream before the parse: if the parse hangs or throws a
    // nasty exception, at least there will be a zero byte file there so that
    // the batch runner can skip that problematic file during the next run.
    InputStream is = getInputStream(fileResource);
    if (is == null) {
        IOUtils.closeQuietly(os);
        return false;
    }

    Throwable thrown = null;
    List<Metadata> metadataList = null;
    Metadata containerMetadata = fileResource.getMetadata();
    try {
        parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context);
        metadataList = parser.getMetadata();
    } catch (Throwable t) {
        thrown = t;
        metadataList = parser.getMetadata();
        if (metadataList == null) {
            metadataList = new LinkedList<>();
        }
        // Attach the stack trace to the top metadata item, falling back to the
        // container metadata when nothing was collected.
        Metadata m = metadataList.isEmpty() ? containerMetadata : metadataList.remove(0);
        String stackTrace = ExceptionUtils.getFilteredStackTrace(t);
        m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime", stackTrace);
        metadataList.add(0, m);
    } finally {
        IOUtils.closeQuietly(is);
    }

    Writer writer = null;
    try {
        writer = new OutputStreamWriter(os, getOutputEncoding());
        JsonMetadataList.toJson(metadataList, writer);
    } catch (Exception e) {
        // This is a stop-the-world kind of thing.
        LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e));
        throw new RuntimeException(e);
    } finally {
        if (writer == null) {
            // The writer could not be created, so nothing wraps os yet;
            // close it directly so the stream is not leaked.
            IOUtils.closeQuietly(os);
        } else {
            flushAndClose(writer);
        }
    }

    if (thrown != null) {
        if (thrown instanceof Error) {
            throw (Error) thrown;
        }
        return false;
    }
    return true;
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class TesseractOCRParser method parse.
/**
 * Extracts image metadata via {@code _TMP_IMAGE_METADATA_PARSER} and then runs
 * the OCR parse, emitting the result as XHTML to {@code handler}.
 *
 * <p>Returns without doing anything when {@code hasTesseract} reports
 * {@code false} for the effective configuration.
 *
 * @param stream       the image data to process
 * @param handler      receives the XHTML SAX events
 * @param metadata     metadata object passed to both parse steps
 * @param parseContext supplies an optional {@link TesseractOCRConfig};
 *                     {@code DEFAULT_CONFIG} is used when none is set
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    TesseractOCRConfig ocrConfig = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // Should only occur if someone directly calls this parser,
    // not via DefaultParser or similar.
    if (!hasTesseract(ocrConfig)) {
        return;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, resources);

        // Trigger the spooling to a tmp file if the stream wasn't
        // already a TikaInputStream that contained a file.
        tis.getPath();

        // Text output file name given on the tesseract command line;
        // the actual output file name will have a suffix added.
        File ocrOutputFile = resources.createTemporaryFile();

        // Temporary workaround for TIKA-1445 - until composite parsers with
        // strategies (eg Composite, Try In Turn) can be specified, always send
        // the image onwards to the regular parser so its metadata is
        // extracted as well.
        _TMP_IMAGE_METADATA_PARSER.parse(tis, new DefaultHandler(), metadata, parseContext);

        XHTMLContentHandler xhtmlOut = new XHTMLContentHandler(handler, metadata);
        xhtmlOut.startDocument();
        parse(tis, ocrOutputFile, parseContext, xhtmlOut, ocrConfig);
        xhtmlOut.endDocument();
    } finally {
        resources.dispose();
    }
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class EpubParser method parse.
/**
 * Walks the entries of the zip container in {@code stream} and dispatches each
 * one by name: the {@code mimetype} entry sets the content type, the
 * {@code metadata.xml} and {@code *.opf} entries go to the {@code meta}
 * parser, and {@code *.html} / {@code *.xhtml} entries go to the
 * {@code content} parser. All other entries are skipped.
 *
 * @param stream   the EPUB (zip) data
 * @param handler  receives the combined XHTML output
 * @param metadata populated with the content type and whatever the delegate
 *                 parsers add
 * @param context  passed through to the delegate parsers
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Because an EPub file is often made up of multiple XHTML files,
    // the start and end of the document are controlled explicitly here.
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    ContentHandler bodyHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));

    ZipInputStream zip = new ZipInputStream(stream);
    for (ZipEntry entry = zip.getNextEntry(); entry != null; entry = zip.getNextEntry()) {
        String entryName = entry.getName();
        if (entryName.equals("mimetype")) {
            // The declared type often has trailing new lines.
            String rawType = IOUtils.toString(zip, UTF_8);
            String mimeType = (rawType == null) ? null : rawType.trim();
            metadata.set(Metadata.CONTENT_TYPE, mimeType);
        } else if (entryName.equals("metadata.xml") || entryName.endsWith(".opf")) {
            // Metadata-only entries: their SAX output is discarded.
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (entryName.endsWith(".html") || entryName.endsWith(".xhtml")) {
            content.parse(zip, bodyHandler, metadata, context);
        }
    }

    // Finish everything
    xhtml.endDocument();
}
use of org.xml.sax.helpers.DefaultHandler in project tika by apache.
the class TesseractOCRParserTest method runOCR.
/**
 * Parses a test resource through a {@link RecursiveParserWrapper} with inline
 * PDF image extraction enabled, checks the collected metadata, and returns the
 * concatenated {@code TIKA_CONTENT} of every metadata item.
 *
 * @param resource       classpath resource to parse
 * @param nonOCRContains strings that must appear in the concatenated content
 * @param numMetadatas   expected number of metadata items
 * @param handlerType    handler type passed to the content handler factory
 * @param outputType     output type set on the Tesseract configuration
 * @return the concatenated content of all metadata items
 */
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setOutputType(outputType);

    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory(handlerType, -1));

    PDFParserConfig pdfParserConfig = new PDFParserConfig();
    pdfParserConfig.setExtractInlineImages(true);

    ParseContext ctx = new ParseContext();
    ctx.set(TesseractOCRConfig.class, ocrConfig);
    ctx.set(Parser.class, wrapper);
    ctx.set(PDFParserConfig.class, pdfParserConfig);

    try (InputStream in = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        wrapper.parse(in, new DefaultHandler(), new Metadata(), ctx);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(numMetadatas, collected.size());

    // Concatenate the extracted text of the container and all embedded items.
    StringBuilder buffer = new StringBuilder();
    for (Metadata item : collected) {
        buffer.append(item.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String allText = buffer.toString();

    for (String expected : nonOCRContains) {
        assertContains(expected, allText);
    }

    // The first two metadata items must each carry more than ten keys.
    assertTrue(collected.get(0).names().length > 10);
    assertTrue(collected.get(1).names().length > 10);
    // Test at least one value.
    assertEquals("deflate", collected.get(1).get("Compression CompressionTypeName"));
    return allText;
}
Aggregations