use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.
the class SolrPluginUtils method copyNamedListIntoArrayByDocPosInResponse.
/**
 * Places each entry of {@code namedList} (assumed to be keyed by document uniqueKey) into
 * {@code destArr}, at the slot matching that document's position in the response.
 * {@code destArr} is assumed to be the same size as {@code resultIds}, which comes from
 * {@link ResponseBuilder#resultIds}. An entry whose key is not present in {@code resultIds}
 * is ignored.
 * Note: most likely you will call {@link #removeNulls(Map.Entry[], NamedList)} sometime after calling this.
 */
public static void copyNamedListIntoArrayByDocPosInResponse(NamedList namedList, Map<Object, ShardDoc> resultIds, Map.Entry<String, Object>[] destArr) {
  assert resultIds.size() == destArr.length;
  for (int pos = 0; pos < namedList.size(); pos++) {
    String docKey = namedList.getName(pos);
    // TODO: lookup won't work for non-string ids... String vs Float
    ShardDoc shardDoc = resultIds.get(docKey);
    if (shardDoc == null) {
      // the lookup may yield null when rb.onePassDistributedQuery; such entries are skipped
      continue;
    }
    destArr[shardDoc.positionInResponse] = new NamedList.NamedListEntry<>(docKey, namedList.getVal(pos));
  }
}
use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.
the class SolrPluginUtils method explanationToNamedList.
/**
 * Converts an {@link Explanation} into a {@link NamedList} holding its "match", "value" and
 * "description". When the explanation has sub-details, they are converted recursively and
 * added as a list under "details"; otherwise no "details" entry is added.
 */
public static NamedList<Object> explanationToNamedList(Explanation e) {
  NamedList<Object> result = new SimpleOrderedMap<>();
  result.add("match", e.isMatch());
  result.add("value", e.getValue());
  result.add("description", e.getDescription());

  Explanation[] children = e.getDetails();
  // leaf explanations carry no "details" entry at all
  if (children.length > 0) {
    List<NamedList<Object>> childLists = new ArrayList<>(children.length);
    for (int i = 0; i < children.length; i++) {
      childLists.add(explanationToNamedList(children[i]));
    }
    result.add("details", childLists);
  }
  return result;
}
use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.
the class ExtractingDocumentLoader method load.
/**
 * Parses {@code stream} with a Tika parser and then either indexes the extracted document
 * (via {@code addDoc}) or, when {@link ExtractingParams#EXTRACT_ONLY} is true, adds the
 * serialized content and the collected metadata to {@code rsp} without indexing.
 *
 * <p>The parser is looked up from the {@link ExtractingParams#STREAM_TYPE} request parameter
 * when it is given; otherwise the auto-detecting parser is used.
 *
 * @param req       the request; supplies the stream type / resource name parameters, the schema and the core
 * @param rsp       the response; only written to in extract-only mode
 * @param stream    the content to parse
 * @param processor not referenced directly in this method body
 * @throws SolrException with {@code BAD_REQUEST} when no parser matches the stream type, and with
 *         {@code SERVER_ERROR} on a {@link SAXException} or on a {@code TikaException} that is
 *         not configured to be ignored
 * @throws Exception any other failure raised while reading or parsing the stream
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
  Parser parser = null;
  String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
  if (streamType != null) {
    //Cache? Parsers are lightweight to construct and thread-safe, so I'm told
    MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
    parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
  } else {
    parser = autoDetectParser;
  }
  if (parser == null) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
  }

  Metadata metadata = new Metadata();
  // If you specify the resource name (the filename, roughly) with this parameter,
  // then Tika can make use of it in guessing the appropriate MIME type:
  String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
  if (resourceName != null) {
    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
  }
  // Provide stream's content type as hint for auto detection
  if (stream.getContentType() != null) {
    metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
  }
  InputStream inputStream = null;
  try {
    inputStream = stream.getStream();
    metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
    metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
    metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
    metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
    // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
    String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
    if (charset != null) {
      metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
    }
    String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
    boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
    SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
    ContentHandler parsingHandler = handler;
    StringWriter writer = null;
    BaseMarkupSerializer serializer = null;
    if (extractOnly) {
      // Extract-only: parse events go to a serializer (text or XML) instead of the SolrContentHandler
      String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
      writer = new StringWriter();
      if (extractFormat.equals(TEXT_FORMAT)) {
        serializer = new TextSerializer();
        serializer.setOutputCharStream(writer);
        serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
      } else {
        serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
      }
      if (xpathExpr != null) {
        Matcher matcher = PARSER.parse(xpathExpr);
        //The MatchingContentHandler does not invoke startDocument. See http://tika.markmail.org/message/kknu3hw7argwiqin
        serializer.startDocument();
        parsingHandler = new MatchingContentHandler(serializer, matcher);
      } else {
        parsingHandler = serializer;
      }
    } else if (xpathExpr != null) {
      Matcher matcher = PARSER.parse(xpathExpr);
      parsingHandler = new MatchingContentHandler(handler, matcher);
    }
    try {
      //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
      ParseContext context = parseContextConfig.create();
      context.set(Parser.class, parser);
      context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
      // Password handling
      RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
      String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
      if (pwMapFile != null && pwMapFile.length() > 0) {
        // try-with-resources accepts a null resource and guarantees the password map
        // stream is closed even if parsing it throws
        try (InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile)) {
          if (is != null) {
            log.debug("Password file supplied: " + pwMapFile);
            epp.parse(is);
          }
        }
      }
      context.set(PasswordProvider.class, epp);
      String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
      if (resourcePassword != null) {
        epp.setExplicitPassword(resourcePassword);
        log.debug("Literal password supplied for file " + resourceName);
      }
      parser.parse(inputStream, parsingHandler, metadata, context);
    } catch (TikaException e) {
      if (ignoreTikaException) {
        // Best effort: processing continues below with whatever was collected before the failure
        log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata.toString());
      } else {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    if (!extractOnly) {
      addDoc(handler);
    } else {
      //serializer is not null, so we need to call endDoc on it if using xpath
      if (xpathExpr != null) {
        serializer.endDocument();
      }
      rsp.add(stream.getName(), writer.toString());
      writer.close();
      // Report every metadata name with all of its values under "<stream name>_metadata"
      NamedList metadataNL = new NamedList();
      for (String name : metadata.names()) {
        String[] vals = metadata.getValues(name);
        metadataNL.add(name, vals);
      }
      rsp.add(stream.getName() + "_metadata", metadataNL);
    }
  } catch (SAXException e) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
  } finally {
    IOUtils.closeQuietly(inputStream);
  }
}
use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.
the class ExtractingRequestHandlerTest method testExtractOnly.
// Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's
// automatic MIME type detection will fail, and it will default to using an empty-string-returning default parser
/**
 * Loads the sample PDF in extract-only mode twice, first without an explicit extract format and
 * then with the text format, and checks the extracted content and the metadata "title" each time.
 */
@Test
public void testExtractOnly() throws Exception {
  final String needle = "solr-word";
  final String xmlProlog = "<?xml";

  ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
  assertTrue("handler is null and it shouldn't be", handler != null);

  // First pass: no extract format given, the output is expected to carry an XML prolog
  SolrQueryResponse xmlRsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true");
  assertTrue("rsp is null and it shouldn't be", xmlRsp != null);
  NamedList xmlValues = xmlRsp.getValues();
  String xmlExtraction = (String) xmlValues.get("solr-word.pdf");
  assertTrue("extraction is null and it shouldn't be", xmlExtraction != null);
  assertTrue(xmlExtraction + " does not contain " + needle, xmlExtraction.contains(needle));
  NamedList xmlMetadata = (NamedList) xmlValues.get("solr-word.pdf_metadata");
  assertTrue("nl is null and it shouldn't be", xmlMetadata != null);
  Object xmlTitle = xmlMetadata.get("title");
  assertTrue("title is null and it shouldn't be", xmlTitle != null);
  assertTrue(xmlExtraction.contains(xmlProlog));

  // Second pass: text format requested, the output must not carry an XML prolog
  SolrQueryResponse textRsp = loadLocal("extraction/solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
  assertTrue("rsp is null and it shouldn't be", textRsp != null);
  NamedList textValues = textRsp.getValues();
  String textExtraction = (String) textValues.get("solr-word.pdf");
  assertTrue("extraction is null and it shouldn't be", textExtraction != null);
  assertTrue(textExtraction + " does not contain " + needle, textExtraction.contains(needle));
  assertTrue(!textExtraction.contains(xmlProlog));
  NamedList textMetadata = (NamedList) textValues.get("solr-word.pdf_metadata");
  assertTrue("nl is null and it shouldn't be", textMetadata != null);
  Object textTitle = textMetadata.get("title");
  assertTrue("title is null and it shouldn't be", textTitle != null);
}
use of org.apache.solr.common.util.NamedList in project lucene-solr by apache.
the class ExtractingRequestHandler method inform.
/**
 * Finishes configuration once the core is available: reads the optional Tika config location,
 * parse-context config and date formats from {@code initArgs}, falls back to defaults for the
 * Tika config and the parse-context config when they are still unset, and creates the factory.
 *
 * @param core the core whose resource loader is used to resolve configuration resources
 * @throws SolrException with {@code SERVER_ERROR} when any of the configurations fails to load
 */
@Override
public void inform(SolrCore core) {
  if (initArgs != null) {
    // A relative location is resolved against the config dir; an absolute path is used as given
    String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION);
    if (tikaConfigLoc != null) {
      File tikaConfigFile = new File(tikaConfigLoc);
      if (!tikaConfigFile.isAbsolute()) {
        tikaConfigFile = new File(core.getResourceLoader().getConfigDir(), tikaConfigFile.getPath());
      }
      try {
        config = new TikaConfig(tikaConfigFile);
      } catch (Exception ex) {
        throw new SolrException(ErrorCode.SERVER_ERROR, ex);
      }
    }

    String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
    if (parseContextConfigLoc != null) {
      try {
        parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
      } catch (Exception ex) {
        throw new SolrException(ErrorCode.SERVER_ERROR, ex);
      }
    }

    NamedList dateFormatArgs = (NamedList) initArgs.get(DATE_FORMATS);
    if (dateFormatArgs != null && dateFormatArgs.size() > 0) {
      dateFormats = new HashSet<>();
      // The list is handled as a raw type, so each element is cast to Map.Entry explicitly
      for (Object item : dateFormatArgs) {
        String format = (String) ((Map.Entry) item).getValue();
        log.info("Adding Date Format: " + format);
        dateFormats.add(format);
      }
    }
  }

  // Defaults for anything the init args did not provide
  if (config == null) {
    try {
      config = getDefaultConfig(core.getResourceLoader().getClassLoader());
    } catch (MimeTypeException | IOException ex) {
      throw new SolrException(ErrorCode.SERVER_ERROR, ex);
    }
  }
  if (parseContextConfig == null) {
    parseContextConfig = new ParseContextConfig();
  }
  factory = createFactory();
}
Aggregations