Use of org.apache.tika.metadata.Metadata in project jackrabbit by apache.
The class NodeIndexer, method addBinaryValue.
/**
* Adds the binary value to the document as the named field.
* <p>
* This implementation checks if this {@link #node} is of type nt:resource
* and if that is the case, tries to extract text from the binary property
* using the {@link #parser}.
*
* @param doc The document to which to add the field
* @param fieldName The name of the field to add
* @param internalValue The value for the field to add to the document.
*/
protected void addBinaryValue(Document doc, String fieldName, InternalValue internalValue) {
    // 'check' if node is of type nt:resource
    try {
        String jcrData = mappings.getPrefix(Name.NS_JCR_URI) + ":data";
        if (!jcrData.equals(fieldName)) {
            // don't know how to index
            return;
        }
        InternalValue type = getValue(NameConstants.JCR_MIMETYPE);
        if (type != null && isSupportedMediaType(type.getString())) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.CONTENT_TYPE, type.getString());
            // jcr:encoding is not mandatory
            InternalValue encoding = getValue(NameConstants.JCR_ENCODING);
            if (encoding != null) {
                metadata.set(Metadata.CONTENT_ENCODING, encoding.getString());
            }
            doc.add(createFulltextField(internalValue, metadata, false));
        }
    } catch (Throwable t) {
        // TODO: How to recover from a transient indexing failure?
        log.warn("Exception while indexing binary property", t);
    }
}
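Note that this method never calls Tika directly: it only seeds a Metadata object with the repository's declared jcr:mimeType and jcr:encoding so that createFulltextField can hand better hints to the parser. The same hinting pattern in isolation might look like the following sketch (the class name and sample bytes are invented for illustration; the Tika calls are the standard Parser API):

import java.io.ByteArrayInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class MetadataHintExample {
    public static void main(String[] args) throws Exception {
        byte[] bytes = "Hello, Tika".getBytes("UTF-8");
        Metadata metadata = new Metadata();
        // Same pattern as NodeIndexer: pass the stored MIME type and encoding
        // to Tika as hints rather than relying purely on auto-detection.
        metadata.set(Metadata.CONTENT_TYPE, "text/plain");
        metadata.set(Metadata.CONTENT_ENCODING, "UTF-8");
        BodyContentHandler handler = new BodyContentHandler();
        try (InputStream in = new ByteArrayInputStream(bytes)) {
            new AutoDetectParser().parse(in, handler, metadata, new ParseContext());
        }
        System.out.println(handler.toString().trim()); // prints: Hello, Tika
    }
}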
Use of org.apache.tika.metadata.Metadata in project jackrabbit-oak by apache.
The class SolrIndexEditor, method extractTextValues.
private List<String> extractTextValues(PropertyState property, NodeState state) {
    List<String> values = new LinkedList<String>();
    Metadata metadata = new Metadata();
    if (JCR_DATA.equals(property.getName())) {
        String type = state.getString(JcrConstants.JCR_MIMETYPE);
        if (type != null) {
            // not mandatory
            metadata.set(Metadata.CONTENT_TYPE, type);
        }
        String encoding = state.getString(JcrConstants.JCR_ENCODING);
        if (encoding != null) {
            // not mandatory
            metadata.set(Metadata.CONTENT_ENCODING, encoding);
        }
    }
    for (Blob v : property.getValue(Type.BINARIES)) {
        values.add(parseStringValue(v, metadata));
    }
    return values;
}
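The helper parseStringValue is not shown in the snippet. Under the assumption that it simply runs each Blob's stream through Tika and collects the body text, a plausible sketch looks like this (the class name, the error-swallowing policy, and the unlimited write limit are all assumptions, not oak's actual code):

import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class BlobTextExtractor {
    private final AutoDetectParser parser = new AutoDetectParser();

    // Roughly what a parseStringValue helper has to do: run the stream
    // through Tika, reusing the caller's Metadata as a type/encoding hint,
    // and return the accumulated body text.
    String parseStringValue(InputStream stream, Metadata metadata) {
        // -1 disables BodyContentHandler's default 100k character write limit.
        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream in = stream) {
            parser.parse(in, handler, metadata, new ParseContext());
        } catch (Exception e) {
            return ""; // index nothing rather than failing the whole commit
        }
        return handler.toString();
    }
}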
Use of org.apache.tika.metadata.Metadata in project lucene-solr by apache.
The class TikaEntityProcessor, method nextRow.
@Override
public Map<String, Object> nextRow() {
    if (done) {
        return null;
    }
    Map<String, Object> row = new HashMap<>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    ContentHandler contentHandler = null;
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    try {
        if ("html".equals(format)) {
            contentHandler = getHtmlHandler(sw);
        } else if ("xml".equals(format)) {
            contentHandler = getXmlContentHandler(sw);
        } else if ("text".equals(format)) {
            contentHandler = getTextContentHandler(sw);
        } else if ("none".equals(format)) {
            contentHandler = new DefaultHandler();
        }
    } catch (TransformerConfigurationException e) {
        wrapAndThrow(SEVERE, e, "Unable to create content handler");
    }
    Parser tikaParser = null;
    if (parser.equals(AUTO_PARSER)) {
        tikaParser = new AutoDetectParser(tikaConfig);
    } else {
        tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
    }
    try {
        ParseContext context = new ParseContext();
        if ("identity".equals(htmlMapper)) {
            context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        }
        if (extractEmbedded) {
            context.set(Parser.class, tikaParser);
        }
        tikaParser.parse(is, contentHandler, metadata, context);
    } catch (Exception e) {
        if (SKIP.equals(onError)) {
            throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW, "Document skipped :" + e.getMessage());
        }
        wrapAndThrow(SEVERE, e, "Unable to read content");
    }
    IOUtils.closeQuietly(is);
    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta"))) {
            continue;
        }
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null) {
            row.put(col, s);
        }
    }
    if (!"none".equals(format)) {
        row.put("text", sw.toString());
    }
    tryToAddLatLon(metadata, row);
    done = true;
    return row;
}
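Two details here are easy to miss: registering tikaParser in its own ParseContext (the extractEmbedded branch) is what makes Tika recurse into attachments and archive entries, and the Metadata object is an output as well as an input, since the loop near the end reads parser-populated keys back out of it. A small standalone sketch of both behaviors (reading the file path from args[0] is an arbitrary choice; the class name is invented):

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

public class EmbeddedDocsExample {
    public static void main(String[] args) throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // Same trick as the extractEmbedded branch above: registering the
        // parser in its own ParseContext makes Tika recurse into embedded docs.
        context.set(Parser.class, parser);
        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream in = Files.newInputStream(Paths.get(args[0]))) {
            parser.parse(in, handler, metadata, context);
        }
        // After the parse, Metadata holds whatever keys the parser filled in.
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}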
Use of org.apache.tika.metadata.Metadata in project lucene-solr by apache.
The class ExtractingDocumentLoader, method load.
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        // Cache? Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide the stream's content type as a hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    // The MatchingContentHandler does not invoke startDocument.
                    // See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                // Potentially use a wrapper handler for parsing, but we still need
                // the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }
            if (!extractOnly) {
                addDoc(handler);
            } else {
                // serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
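Note that this loader consistently uses metadata.add(...) rather than set(...), and later reads everything back through names() and getValues(). The difference matters because a Tika Metadata key can hold multiple values. A minimal illustration (the literal keys are placeholders, not the ExtractingMetadataConstants used above):

import org.apache.tika.metadata.Metadata;

public class MetadataMultiValueExample {
    public static void main(String[] args) {
        Metadata metadata = new Metadata();
        // add() appends, which is why the loader above can accumulate several
        // values and read them all back later via names()/getValues().
        metadata.add("stream_source_info", "upload");
        metadata.add("stream_source_info", "multipart");
        metadata.set("stream_size", "1024"); // set() replaces any prior value
        for (String name : metadata.names()) {
            for (String val : metadata.getValues(name)) {
                System.out.println(name + " = " + val);
            }
        }
    }
}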
Use of org.apache.tika.metadata.Metadata in project tika by apache.
The class TikaTest, method getRecursiveMetadata.
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
        new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, new DefaultHandler(), new Metadata(), context);
    }
    return wrapper.getMetadata();
}
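Used on its own, the Tika 1.x RecursiveParserWrapper API from this helper yields one Metadata per parsed document, the container first and embedded documents after it. A short usage sketch (class name invented; reading the input file from args[0] is an arbitrary choice):

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.xml.sax.helpers.DefaultHandler;

public class RecursiveMetadataExample {
    public static void main(String[] args) throws Exception {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        try (InputStream is = Files.newInputStream(Paths.get(args[0]))) {
            wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
        }
        // One Metadata per document: index 0 is the container, the rest are attachments.
        List<Metadata> all = wrapper.getMetadata();
        for (Metadata m : all) {
            System.out.println(m.get(Metadata.CONTENT_TYPE));
        }
    }
}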