Use of org.codelibs.fess.crawler.extractor.Extractor in project fess by codelibs.
The class FessFileTransformer, method getExtractor:
@Override
protected Extractor getExtractor(final ResponseData responseData) {
    final ExtractorFactory extractorFactory = ComponentUtil.getExtractorFactory();
    if (extractorFactory == null) {
        throw new FessSystemException("Could not find extractorFactory.");
    }
    final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());
    if (logger.isDebugEnabled()) {
        logger.debug("url={}, extractor={}", responseData.getUrl(), extractor);
    }
    return extractor;
}
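For context, a minimal sketch of how an ExtractorFactory is typically populated and queried. The MIME-type keys and the hand-built TikaExtractor are assumptions for illustration; in fess this wiring is done by the DI container, and ComponentUtil.getExtractorFactory() simply returns the configured instance.

import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;

public class ExtractorFactorySketch {
    public static void main(final String[] args) {
        // Hypothetical registration: normally done by the container, not by hand.
        final ExtractorFactory factory = new ExtractorFactory();
        factory.addExtractor("application/pdf", new TikaExtractor());
        factory.addExtractor("text/plain", new TikaExtractor());

        // getExtractor(responseData.getMimeType()) in the method above
        // reduces to a lookup like this one.
        final Extractor extractor = factory.getExtractor("application/pdf");
        System.out.println(extractor);
    }
}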
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess by codelibs.
The class AbstractFessFileTransformer, method generateData:
protected Map<String, Object> generateData(final ResponseData responseData) {
    // Obtain the configuration up front; it is used inside the try block below.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final Extractor extractor = getExtractor(responseData);
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta
        extractData.getKeySet().stream()
                .filter(k -> extractData.getValues(k) != null)
                .forEach(key -> {
                    final String[] values = extractData.getValues(key);
                    metaDataMap.put(key, values);
                    if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                        final String joinedValue = StringUtils.join(values, ' ');
                        if (StringUtil.isNotBlank(joinedValue)) {
                            if (contentMetaBuf.length() > 0) {
                                contentMetaBuf.append(' ');
                            }
                            contentMetaBuf.append(joinedValue.trim());
                        }
                    }
                    final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
                    if (mapping != null) {
                        if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), values);
                        } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                            final String joinedValue = StringUtils.join(values, ' ');
                            dataMap.put(mapping.getFirst(), joinedValue.trim());
                        } else if (values.length == 1) {
                            try {
                                if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                                    dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                                } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                                    dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                                } else {
                                    logger.warn("Unknown mapping type: {}={}", key, mapping);
                                }
                            } catch (final NumberFormatException e) {
                                logger.warn("Failed to parse " + values[0], e);
                            }
                        }
                    }
                });
    } catch (final Exception e) {
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
            || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0
                && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
            documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
                        documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else if (StringUtil.isBlank(fileName)) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
        } else {
            putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
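The densest part of generateData is the metadata-mapping branch near the top, which dispatches each extracted metadata key to an index field as an array, string, long, or double. A minimal, self-contained sketch of that dispatch; the field name "filesize", the mapping-type strings, and the sample value are illustrative assumptions, not fess configuration taken from the code above.

import java.util.HashMap;
import java.util.Map;

public class MetadataMappingSketch {
    public static void main(final String[] args) {
        final Map<String, Object> dataMap = new HashMap<>();
        final String[] values = { "2048" };

        // Hypothetical mapping rule: put the extracted metadata values
        // into the field "filesize" converted to a long.
        final String field = "filesize";
        final String type = "long";

        if ("array".equalsIgnoreCase(type)) {
            dataMap.put(field, values);
        } else if ("string".equalsIgnoreCase(type)) {
            dataMap.put(field, String.join(" ", values).trim());
        } else if (values.length == 1) {
            if ("long".equalsIgnoreCase(type)) {
                dataMap.put(field, Long.parseLong(values[0]));
            } else if ("double".equalsIgnoreCase(type)) {
                dataMap.put(field, Double.parseDouble(values[0]));
            }
        }
        System.out.println(dataMap); // {filesize=2048}
    }
}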
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
The class TikaExtractor, method getText:
@Override
public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
    if (inputStream == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final File tempFile;
    final boolean isByteStream = inputStream instanceof ByteArrayInputStream;
    if (isByteStream) {
        inputStream.mark(0);
        tempFile = null;
    } else {
        try {
            tempFile = File.createTempFile("tikaExtractor-", ".out");
        } catch (final IOException e) {
            throw new ExtractException("Could not create a temp file.", e);
        }
    }
    try {
        final PrintStream originalOutStream = System.out;
        final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        System.setOut(new PrintStream(outStream, true));
        final PrintStream originalErrStream = System.err;
        final ByteArrayOutputStream errStream = new ByteArrayOutputStream();
        System.setErr(new PrintStream(errStream, true));
        try {
            final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
            final String contentType = params == null ? null : params.get(HttpHeaders.CONTENT_TYPE);
            String contentEncoding = params == null ? null : params.get(HttpHeaders.CONTENT_ENCODING);
            final String pdfPassword = getPassword(params);
            final Metadata metadata = createMetadata(resourceName, contentType, contentEncoding, pdfPassword);
            final Parser parser = new TikaDetectParser();
            final ParseContext parseContext = createParseContext(parser, params);
            String content = getContent(writer -> {
                InputStream in = null;
                try {
                    if (!isByteStream) {
                        try (OutputStream out = new FileOutputStream(tempFile)) {
                            CopyUtil.copy(inputStream, out);
                        }
                        in = new FileInputStream(tempFile);
                    } else {
                        in = inputStream;
                    }
                    parser.parse(in, new BodyContentHandler(writer), metadata, parseContext);
                } finally {
                    CloseableUtil.closeQuietly(in);
                }
            }, contentEncoding);
            if (StringUtil.isBlank(content)) {
                if (resourceName != null) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a resource name: {}", resourceName);
                    }
                    final Metadata metadata2 = createMetadata(null, contentType, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                in = inputStream;
                            } else {
                                in = new FileInputStream(tempFile);
                            }
                            parser.parse(in, new BodyContentHandler(writer), metadata2, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }
                if (StringUtil.isBlank(content) && contentType != null) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a content type: {}", contentType);
                    }
                    final Metadata metadata3 = createMetadata(null, null, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                in = inputStream;
                            } else {
                                in = new FileInputStream(tempFile);
                            }
                            parser.parse(in, new BodyContentHandler(writer), metadata3, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }
                if (readAsTextIfFailed && StringUtil.isBlank(content)) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("read the content as a text.");
                    }
                    if (contentEncoding == null) {
                        contentEncoding = Constants.UTF_8;
                    }
                    final String enc = contentEncoding;
                    content = getContent(writer -> {
                        BufferedReader br = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                br = new BufferedReader(new InputStreamReader(inputStream, enc));
                            } else {
                                br = new BufferedReader(new InputStreamReader(new FileInputStream(tempFile), enc));
                            }
                            String line;
                            while ((line = br.readLine()) != null) {
                                writer.write(line);
                            }
                        } catch (final Exception e) {
                            logger.warn("Could not read " + (tempFile != null ? tempFile.getAbsolutePath() : "a byte stream"), e);
                        } finally {
                            CloseableUtil.closeQuietly(br);
                        }
                    }, contentEncoding);
                }
            }
            final ExtractData extractData = new ExtractData(content);
            final String[] names = metadata.names();
            Arrays.sort(names);
            for (final String name : names) {
                extractData.putValues(name, metadata.getValues(name));
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Result: metadata: {}", metadata);
            }
            return extractData;
        } catch (final TikaException e) {
            if (e.getMessage().indexOf("bomb") >= 0) {
                throw e;
            }
            final Throwable cause = e.getCause();
            if (cause instanceof SAXException) {
                final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor");
                if (xmlExtractor != null) {
                    InputStream in = null;
                    try {
                        if (isByteStream) {
                            inputStream.reset();
                            in = inputStream;
                        } else {
                            in = new FileInputStream(tempFile);
                        }
                        return xmlExtractor.getText(in, params);
                    } finally {
                        CloseableUtil.closeQuietly(in);
                    }
                }
            }
            throw e;
        } finally {
            if (originalOutStream != null) {
                System.setOut(originalOutStream);
            }
            if (originalErrStream != null) {
                System.setErr(originalErrStream);
            }
            try {
                if (logger.isInfoEnabled()) {
                    final byte[] bs = outStream.toByteArray();
                    if (bs.length != 0) {
                        logger.info(new String(bs, outputEncoding));
                    }
                }
                if (logger.isWarnEnabled()) {
                    final byte[] bs = errStream.toByteArray();
                    if (bs.length != 0) {
                        logger.warn(new String(bs, outputEncoding));
                    }
                }
            } catch (final Exception e) {
                // NOP
            }
        }
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
}
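A minimal usage sketch for this method. The standalone construction and the sample file name are assumptions; in fess-crawler the extractor is normally obtained from the DI container, which also injects the crawlerContainer used by the XML fallback above.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.tika.metadata.TikaMetadataKeys;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;

public class TikaExtractorSketch {
    public static void main(final String[] args) throws Exception {
        // Hypothetical standalone use; a container-managed instance is the norm.
        final TikaExtractor extractor = new TikaExtractor();
        final Map<String, String> params = new HashMap<>();
        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, "sample.pdf"); // assumed file
        try (InputStream in = new FileInputStream("sample.pdf")) {
            final ExtractData data = extractor.getText(in, params);
            System.out.println(data.getContent());
        }
    }
}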
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
The class JodExtractor, method getOutputContent:
protected String getOutputContent(final File outputFile, final String outExt) {
    final Extractor extractor = getExtractor(outExt);
    if (extractor != null) {
        final Map<String, String> params = new HashMap<>();
        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, outputFile.getName());
        FileInputStream in = null;
        try {
            in = new FileInputStream(outputFile);
            final ExtractData extractData = extractor.getText(in, params);
            return extractData.getContent();
        } catch (final FileNotFoundException e) {
            throw new ExtractException("Could not open " + outputFile.getAbsolutePath(), e);
        } finally {
            CloseableUtil.closeQuietly(in);
        }
    }
    try {
        return new String(FileUtil.readBytes(outputFile), outputEncoding);
    } catch (final UnsupportedEncodingException e) {
        return new String(FileUtil.readBytes(outputFile), Constants.UTF_8_CHARSET);
    }
}
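JodExtractor converts an office document with JODConverter and then hands the converted output file to an extractor selected by the output extension, falling back to reading the raw bytes when none is registered. A simplified, hypothetical sketch of that extension-based dispatch; the extension keys and extractor instances are assumptions for illustration.

import java.util.HashMap;
import java.util.Map;

import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;

public class OutputExtractorDispatchSketch {
    // Hypothetical mirror of an extension-to-extractor registry.
    private static final Map<String, Extractor> EXTRACTOR_MAP = new HashMap<>();
    static {
        EXTRACTOR_MAP.put("txt", new TikaExtractor());
        EXTRACTOR_MAP.put("html", new TikaExtractor());
    }

    static Extractor getExtractor(final String ext) {
        // A null result triggers the raw-bytes fallback in getOutputContent above.
        return ext == null ? null : EXTRACTOR_MAP.get(ext.toLowerCase());
    }
}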
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
The class LhaExtractor, method getText:
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    File tempFile = null;
    LhaFile lhaFile = null;
    try {
        tempFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream fos = new FileOutputStream(tempFile)) {
            CopyUtil.copy(in, fos);
        }
        lhaFile = new LhaFile(tempFile);
        @SuppressWarnings("unchecked")
        final Enumeration<LhaHeader> entries = lhaFile.entries();
        long contentSize = 0;
        while (entries.hasMoreElements()) {
            final LhaHeader head = entries.nextElement();
            contentSize += head.getOriginalSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = head.getPath();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    InputStream is = null;
                    try {
                        is = lhaFile.getInputStream(head);
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    } finally {
                        CloseableUtil.closeQuietly(is);
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (lhaFile != null) {
            try {
                lhaFile.close();
            } catch (final IOException e) {
                // ignore
            }
        }
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
    return new ExtractData(buf.toString().trim());
}
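A minimal usage sketch; the archive path is an assumption, and in practice the instance comes from the DI container so that getMimeTypeHelper() and getExtractorFactory() resolve to configured components rather than returning nothing.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;

import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.extractor.impl.LhaExtractor;

public class LhaExtractorSketch {
    public static void main(final String[] args) throws Exception {
        // Hypothetical: a container-managed LhaExtractor and a sample archive.
        final LhaExtractor extractor = new LhaExtractor();
        try (InputStream in = new FileInputStream("sample.lzh")) {
            final ExtractData data = extractor.getText(in, new HashMap<>());
            // Concatenated text of all extractable archive entries.
            System.out.println(data.getContent());
        }
    }
}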