use of org.apache.tika.exception.TikaException in project tika by apache.
the class ParseContext method getDocumentBuilder.
/**
 * Looks up the DOM {@link DocumentBuilder} registered in this parsing
 * context and returns it unchanged when one is present.
 * <p>
 * When no builder has been registered, a new one is obtained from
 * {@link #getDocumentBuilderFactory()}. Only such a newly created builder
 * is configured here: its entity resolver is set to
 * {@link #IGNORING_SAX_ENTITY_RESOLVER} and its ErrorHandler is set to
 * <code>null</code>. The new builder is not stored back into the context
 * by this method.
 *
 * @since Apache Tika 1.13
 * @return the registered DOM builder, or a newly created and configured one
 * @throws TikaException if the factory reports a
 *         {@link ParserConfigurationException} while creating the builder
 */
public DocumentBuilder getDocumentBuilder() throws TikaException {
    DocumentBuilder registered = get(DocumentBuilder.class);
    if (registered == null) {
        try {
            DocumentBuilder created = getDocumentBuilderFactory().newDocumentBuilder();
            created.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
            created.setErrorHandler(null);
            return created;
        } catch (ParserConfigurationException e) {
            throw new TikaException("XML parser not available", e);
        }
    }
    // A caller-supplied builder is handed back as-is, without reconfiguration.
    return registered;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class ParseContext method getXMLReader.
/**
 * Looks up the {@link XMLReader} registered in this parsing context and
 * returns it unchanged when one is present.
 * <p>
 * Otherwise a reader is requested from the parser returned by
 * {@link #getSAXParser()} (the specified or the default SAX parser), and
 * that new reader's entity resolver is set to
 * {@link #IGNORING_SAX_ENTITY_RESOLVER} before it is returned.
 *
 * @see #getSAXParser()
 * @since Apache Tika 1.13
 * @return the registered XMLReader, or a newly created one
 * @throws TikaException if the SAX parser raises a {@link SAXException}
 *         while supplying the reader
 */
public XMLReader getXMLReader() throws TikaException {
    XMLReader registered = get(XMLReader.class);
    if (registered != null) {
        // A caller-supplied reader is handed back as-is.
        return registered;
    }

    XMLReader created;
    try {
        created = getSAXParser().getXMLReader();
    } catch (SAXException e) {
        throw new TikaException("Unable to create an XMLReader", e);
    }
    created.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
    return created;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class TikaConfigTest method testUnknownParser.
/**
 * Checks that the {@link LoadErrorHandler} given to the service loader
 * decides what happens when the configuration names a parser class
 * which can't be found: with IGNORE and WARN the config still loads and
 * exposes a single component parser, while with THROW building the
 * config fails with a {@link TikaException}.
 */
@Test
public void testUnknownParser() throws Exception {
    ClassLoader classLoader = getClass().getClassLoader();
    ServiceLoader ignoring = new ServiceLoader(classLoader, LoadErrorHandler.IGNORE);
    ServiceLoader warning = new ServiceLoader(classLoader, LoadErrorHandler.WARN);
    ServiceLoader throwing = new ServiceLoader(classLoader, LoadErrorHandler.THROW);

    Path unknownParserConfig = Paths.get(new URI(getConfigPath("TIKA-1700-unknown-parser.xml")));

    // IGNORE: the config loads, leaving one component parser
    TikaConfig ignoreConfig = new TikaConfig(unknownParserConfig, ignoring);
    assertNotNull(ignoreConfig);
    assertNotNull(ignoreConfig.getParser());
    CompositeParser ignoreComposite = (CompositeParser) ignoreConfig.getParser();
    assertEquals(1, ignoreComposite.getAllComponentParsers().size());

    // WARN: same outcome as IGNORE as far as the resulting parsers go
    TikaConfig warnConfig = new TikaConfig(unknownParserConfig, warning);
    assertNotNull(warnConfig);
    assertNotNull(warnConfig.getParser());
    CompositeParser warnComposite = (CompositeParser) warnConfig.getParser();
    assertEquals(1, warnComposite.getAllComponentParsers().size());

    // THROW: building the config must fail
    try {
        new TikaConfig(unknownParserConfig, throwing);
        fail("Shouldn't get here, invalid parser class");
    } catch (TikaException expected) {
        // expected outcome, nothing further to verify
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class TikaConfigTest method defaultParserWithExcludes.
/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 * certain types, so that another explicitly listed parser takes them.
 * <p>
 * Loads the TIKA-1445-default-except.xml config and checks that its
 * three component parsers are all {@link ParserDecorator}s, wrapping in
 * order a DefaultParser, an EmptyParser and an ErrorParser, and that
 * the latter two report "hello/world" and "fail/world" as their first
 * supported type.
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1445-default-except.xml");
        CompositeParser composite = (CompositeParser) config.getParser();
        List<Parser> components = composite.getAllComponentParsers();

        // Exactly the three parsers declared in the xml
        assertEquals(3, components.size());

        // First: a decorated DefaultParser rather than the plain one,
        // because it is excluded from handling certain types
        Parser first = components.get(0);
        assertTrue(first.toString(), first instanceof ParserDecorator);
        ParserDecorator firstDecorator = (ParserDecorator) first;
        assertEquals(DefaultParser.class, firstDecorator.getWrappedParser().getClass());

        // Second and third: decorated parsers claiming types which
        // they wouldn't otherwise handle
        Parser second = components.get(1);
        assertTrue(second.toString(), second instanceof ParserDecorator);
        ParserDecorator secondDecorator = (ParserDecorator) second;
        assertEquals(EmptyParser.class, secondDecorator.getWrappedParser().getClass());
        assertEquals("hello/world", second.getSupportedTypes(null).iterator().next().toString());

        Parser third = components.get(2);
        assertTrue(third.toString(), third instanceof ParserDecorator);
        ParserDecorator thirdDecorator = (ParserDecorator) third;
        assertEquals(ErrorParser.class, thirdDecorator.getWrappedParser().getClass());
        assertEquals("fail/world", third.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class RTFObjDataParser method parse.
/**
 * Parses the embedded object data and returns the bytes of the
 * embedded content.
 * <p>
 * The header is read in this order: version, format id, class name,
 * topic name, item name, data size, then the data itself. The version
 * and any non-empty class/topic/item names are added to
 * <code>metadata</code>. What is returned depends on the class name
 * (compared case-insensitively):
 * <ul>
 * <li>"package": the result of <code>handlePackage</code></li>
 * <li>"pbrush": the data bytes as read (simple bitmap bytes)</li>
 * <li>anything else: if the data has a POIFS header, the result of
 * <code>handleEmbeddedPOIFS</code>; otherwise the data bytes as read</li>
 * </ul>
 * If checking for the POIFS header or handling the embedded POIFS
 * fails, the exception is recorded in <code>metadata</code> and the
 * data bytes are returned as read.
 *
 * @param bytes actual bytes (already converted from the hex pair
 *        string stored in the embedded object data into actual bytes,
 *        or read as raw binary bytes)
 * @param metadata incoming metadata; embedded-object properties are added to it
 * @param unknownFilenameCount counter passed through to <code>handleEmbeddedPOIFS</code>
 * @return byte[] for contents of obj data, or <code>null</code> if the
 *         format id is not 2 (2 is an embedded object, 1 is a link)
 * @throws IOException if there are any surprises while reading the data
 * @throws TikaException declared by this method; presumably raised by
 *         one of the helper methods it calls -- TODO confirm which
 */
protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException, TikaException {
    ByteArrayInputStream in = new ByteArrayInputStream(bytes);

    long appVersion = readUInt(in);
    metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(appVersion));

    // 2 is an embedded object. 1 is a link.
    long formatId = readUInt(in);
    if (formatId != 2L) {
        return null;
    }

    // trim() never yields null, so an emptiness check is all that is needed below
    String className = readLengthPrefixedAnsiString(in).trim();
    String topicName = readLengthPrefixedAnsiString(in).trim();
    String itemName = readLengthPrefixedAnsiString(in).trim();
    if (!className.isEmpty()) {
        metadata.add(RTFMetadata.EMB_CLASS, className);
    }
    if (!topicName.isEmpty()) {
        metadata.add(RTFMetadata.EMB_TOPIC, topicName);
    }
    if (!itemName.isEmpty()) {
        metadata.add(RTFMetadata.EMB_ITEM, itemName);
    }

    // readBytes tests for reading too many bytes
    long dataSize = readUInt(in);
    byte[] payload = readBytes(in, dataSize);

    String normalizedClass = className.toLowerCase(Locale.ROOT);
    if ("package".equals(normalizedClass)) {
        return handlePackage(payload, metadata);
    }
    if ("pbrush".equals(normalizedClass)) {
        // simple bitmap bytes
        return payload;
    }

    ByteArrayInputStream payloadStream = new ByteArrayInputStream(payload);
    boolean looksLikePoifs;
    try {
        looksLikePoifs = NPOIFSFileSystem.hasPOIFSHeader(payloadStream);
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return payload;
    }
    if (!looksLikePoifs) {
        return payload;
    }

    try {
        return handleEmbeddedPOIFS(payloadStream, metadata, unknownFilenameCount);
    } catch (Exception e) {
        // Best effort: note the failure and fall back to the raw bytes
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return payload;
    }
}
Aggregations