Use of org.apache.tika.parser.DefaultParser in project lucene-solr by Apache.
Class ExtractingDocumentLoader, method load.
/**
 * Runs the content of {@code stream} through a Tika parser and then either
 * passes the populated {@code SolrContentHandler} to {@code addDoc} or, when
 * {@code ExtractingParams.EXTRACT_ONLY} is set, adds the serialized content
 * and the collected metadata to {@code rsp}.
 *
 * <p>Parser selection: if the request carries {@code ExtractingParams.STREAM_TYPE},
 * the parser registered for that media type in a fresh {@code DefaultParser} is
 * used; otherwise {@code autoDetectParser} is used.
 *
 * @param req       request supplying the parameters, the schema and the core's resource loader
 * @param rsp       response that receives the extracted text and metadata in extract-only mode
 * @param stream    content to parse; its stream is closed quietly before this method returns
 * @param processor not referenced directly by this method
 * @throws SolrException with {@code BAD_REQUEST} when no parser matches the requested stream
 *         type, or with {@code SERVER_ERROR} wrapping a {@code SAXException} or (unless
 *         {@code ignoreTikaException} is set) a {@code TikaException}
 * @throws Exception any other failure raised while reading or parsing the stream
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
  Parser parser = null;
  String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
  if (streamType != null) {
    //Cache? Parsers are lightweight to construct and thread-safe, so I'm told
    MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
    parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
  } else {
    parser = autoDetectParser;
  }
  if (parser == null) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
  }

  Metadata metadata = new Metadata();
  // If you specify the resource name (the filename, roughly) with this parameter,
  // then Tika can make use of it in guessing the appropriate MIME type:
  String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
  if (resourceName != null) {
    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
  }
  // Provide stream's content type as hint for auto detection
  if (stream.getContentType() != null) {
    metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
  }

  InputStream inputStream = null;
  try {
    inputStream = stream.getStream();
    metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
    metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
    metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
    metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
    // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
    String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
    if (charset != null) {
      metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
    }

    String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
    boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
    SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
    ContentHandler parsingHandler = handler;
    StringWriter writer = null;
    BaseMarkupSerializer serializer = null;
    if (extractOnly) {
      // Extract-only mode: parse events go to a serializer instead of the Solr handler
      String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
      writer = new StringWriter();
      if (extractFormat.equals(TEXT_FORMAT)) {
        serializer = new TextSerializer();
        serializer.setOutputCharStream(writer);
        serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
      } else {
        serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
      }
      if (xpathExpr != null) {
        Matcher matcher = PARSER.parse(xpathExpr);
        //The MatchingContentHandler does not invoke startDocument. See http://tika.markmail.org/message/kknu3hw7argwiqin
        serializer.startDocument();
        parsingHandler = new MatchingContentHandler(serializer, matcher);
      } else {
        parsingHandler = serializer;
      }
    } else if (xpathExpr != null) {
      Matcher matcher = PARSER.parse(xpathExpr);
      parsingHandler = new MatchingContentHandler(handler, matcher);
    }

    try {
      //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
      ParseContext context = parseContextConfig.create();
      context.set(Parser.class, parser);
      context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

      // Password handling
      RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
      String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
      if (pwMapFile != null && pwMapFile.length() > 0) {
        // try-with-resources guarantees the rules file is closed even when parsing it fails;
        // a null resource is legal here and is simply skipped on close.
        try (InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile)) {
          if (is != null) {
            log.debug("Password file supplied: " + pwMapFile);
            epp.parse(is);
          }
        }
      }
      context.set(PasswordProvider.class, epp);
      String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
      if (resourcePassword != null) {
        epp.setExplicitPassword(resourcePassword);
        log.debug("Literal password supplied for file " + resourceName);
      }

      parser.parse(inputStream, parsingHandler, metadata, context);
    } catch (TikaException e) {
      if (ignoreTikaException) {
        // Best-effort mode: keep whatever was extracted before the failure
        log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata.toString());
      } else {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }

    if (!extractOnly) {
      addDoc(handler);
    } else {
      //serializer is not null, so we need to call endDoc on it if using xpath
      if (xpathExpr != null) {
        serializer.endDocument();
      }
      rsp.add(stream.getName(), writer.toString());
      writer.close();
      NamedList<String[]> metadataNL = new NamedList<>();
      for (String name : metadata.names()) {
        metadataNL.add(name, metadata.getValues(name));
      }
      rsp.add(stream.getName() + "_metadata", metadataNL);
    }
  } catch (SAXException e) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
  } finally {
    IOUtils.closeQuietly(inputStream);
  }
}
Use of org.apache.tika.parser.DefaultParser in project tika by Apache.
Class TikaConfigTest, method defaultParserWithExcludes.
/**
 * TIKA-1445: the DefaultParser can be excluded from certain types so that
 * another, explicitly listed parser takes them over.
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig tikaConfig = getConfig("TIKA-1445-default-except.xml");
        CompositeParser composite = (CompositeParser) tikaConfig.getParser();
        List<Parser> components = composite.getAllComponentParsers();

        // Exactly the three parsers declared in the XML are expected
        assertEquals(3, components.size());

        // Slot 0: the DefaultParser, but decorated rather than bare,
        // because it is excluded from handling certain types
        Parser first = components.get(0);
        assertTrue(first.toString(), first instanceof ParserDecorator);
        Parser firstInner = ((ParserDecorator) first).getWrappedParser();
        assertEquals(DefaultParser.class, firstInner.getClass());

        // Slots 1 and 2: decorated parsers claiming types they would not
        // otherwise handle
        Parser second = components.get(1);
        assertTrue(second.toString(), second instanceof ParserDecorator);
        Parser secondInner = ((ParserDecorator) second).getWrappedParser();
        assertEquals(EmptyParser.class, secondInner.getClass());
        assertEquals("hello/world", second.getSupportedTypes(null).iterator().next().toString());

        Parser third = components.get(2);
        assertTrue(third.toString(), third instanceof ParserDecorator);
        Parser thirdInner = ((ParserDecorator) third).getWrappedParser();
        assertEquals(ErrorParser.class, thirdInner.getClass());
        assertEquals("fail/world", third.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Use of org.apache.tika.parser.DefaultParser in project tika by Apache.
Class TikaParserConfigTest, method testMimeExcludeInclude.
/**
 * Loads the TIKA-1558 blacklist config and checks that it yields two decorated
 * parsers: a DefaultParser with PDF and JPEG excluded, and an EmptyParser that
 * claims PDF only.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    TikaConfig tikaConfig = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(tikaConfig.getParser());
    assertNotNull(tikaConfig.getDetector());

    Parser root = tikaConfig.getParser();
    MediaType pdf = MediaType.application("pdf");
    MediaType jpeg = MediaType.image("jpeg");

    // The root is a plain CompositeParser holding two component parsers
    assertEquals(CompositeParser.class, root.getClass());
    List<Parser> components = ((CompositeParser) root).getAllComponentParsers();
    assertEquals(2, components.size());

    // Each component is wrapped in a decorator
    Parser first = components.get(0);
    Parser second = components.get(1);
    assertTrue(first instanceof ParserDecorator);
    assertTrue(second instanceof ParserDecorator);
    ParserDecorator decoratedDefault = (ParserDecorator) first;
    ParserDecorator decoratedEmpty = (ParserDecorator) second;

    // The decorator hides PDF and JPEG, while the wrapped DefaultParser still supports both
    Parser innerDefault = decoratedDefault.getWrappedParser();
    assertEquals(DefaultParser.class, innerDefault.getClass());
    assertNotContained(pdf, decoratedDefault.getSupportedTypes(context));
    assertContains(pdf, decoratedDefault.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpeg, decoratedDefault.getSupportedTypes(context));
    assertContains(jpeg, decoratedDefault.getWrappedParser().getSupportedTypes(context));

    // The second decorator claims PDF only, on behalf of an EmptyParser that does not
    Parser innerEmpty = decoratedEmpty.getWrappedParser();
    assertEquals(EmptyParser.class, innerEmpty.getClass());
    assertEquals(1, decoratedEmpty.getSupportedTypes(context).size());
    assertContains(pdf, decoratedEmpty.getSupportedTypes(context));
    assertNotContained(pdf, decoratedEmpty.getWrappedParser().getSupportedTypes(context));
}
Use of org.apache.tika.parser.DefaultParser in project tika by Apache.
Class TikaParserConfigTest, method defaultParserBlacklist.
/**
 * TIKA-1558: parsers can be excluded from the set that DefaultParser
 * picks up.
 */
@Test
public void defaultParserBlacklist() throws Exception {
    // Baseline: the stock configuration contains an XMLParser
    TikaConfig tikaConfig = new TikaConfig();
    assertNotNull(tikaConfig.getParser());
    assertNotNull(tikaConfig.getDetector());
    CompositeParser composite = (CompositeParser) tikaConfig.getParser();
    List<Parser> components = composite.getAllComponentParsers();

    boolean xmlParserFound = false;
    for (int i = 0; i < components.size() && !xmlParserFound; i++) {
        xmlParserFound = components.get(i) instanceof XMLParser;
    }
    assertTrue("Default config should include an XMLParser.", xmlParserFound);

    // The custom configuration should drop XMLParser together with all of its subclasses
    tikaConfig = getConfig("TIKA-1558-blacklistsub.xml");
    composite = (CompositeParser) tikaConfig.getParser();
    components = composite.getAllComponentParsers();
    for (Parser candidate : components) {
        if (candidate instanceof XMLParser) {
            fail("Custom config should not include an XMLParser (" + candidate.getClass() + ").");
        }
    }
}
Use of org.apache.tika.parser.DefaultParser in project tika by Apache.
Class BundleIT, method testBundleParsers.
/**
 * Checks that the parser class names exposed through the OSGi Parser service
 * equal the names reachable from {@code defaultParser}, with any nested
 * DefaultParser flattened into its component parsers.
 */
@Test
public void testBundleParsers() throws Exception {
    // Parser class names as seen through the OSGi service registry
    ServiceReference<Parser> serviceRef = bc.getServiceReference(Parser.class);
    DefaultParser osgiDefault = (DefaultParser) bc.getService(serviceRef);
    Set<String> osgiParsers = new HashSet<>();
    for (Parser component : osgiDefault.getAllComponentParsers()) {
        String className = component.getClass().getName();
        osgiParsers.add(className);
    }

    // Sanity check that a reasonable number of parsers was discovered
    assertTrue("Should have lots Parser names, found " + osgiParsers.size(), osgiParsers.size() > 15);

    // Parser class names from the traditional service loading mechanism
    CompositeParser classpathComposite = (CompositeParser) defaultParser;
    Set<String> rawParsers = new HashSet<>();
    for (Parser component : classpathComposite.getAllComponentParsers()) {
        if (!(component instanceof DefaultParser)) {
            rawParsers.add(component.getClass().getName());
            continue;
        }
        // A nested DefaultParser contributes its children, not itself
        DefaultParser nestedDefault = (DefaultParser) component;
        for (Parser child : nestedDefault.getAllComponentParsers()) {
            rawParsers.add(child.getClass().getName());
        }
    }

    assertEquals(rawParsers, osgiParsers);
}
Aggregations