Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
Class TextTransformer, method transform.
/**
 * Extracts the text of the response body and wraps it in a {@code ResultData}.
 *
 * <p>The extractor is obtained from the "extractorFactory" component using the MIME type of
 * the response. The extracted text is encoded with {@code charsetName}; when that charset is
 * not supported, {@code charsetName} is reset to UTF-8 and the text is encoded as UTF-8.</p>
 *
 * @param responseData the response to transform; must not be null and must have a response body
 * @return the result data carrying the encoded text, this transformer's name and the encoding
 * @throws CrawlingAccessException if there is no response body, if no extractor is available
 *         for the MIME type, or if the extraction fails
 * @throws CrawlerSystemException if the "extractorFactory" component cannot be found
 * @see AbstractTransformer#transform(ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }
    final String mimeType = responseData.getMimeType();
    final Extractor extractor = extractorFactory.getExtractor(mimeType);
    if (extractor == null) {
        // Report the missing extractor directly instead of relying on a NullPointerException
        // being caught and wrapped by the extraction block below.
        throw new CrawlingAccessException("Could not find an extractor for " + mimeType + ".");
    }
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        content = extractor.getText(in, params).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }
    if (content == null) {
        // An extractor that yields no text is treated as an empty document rather than
        // failing with a NullPointerException while encoding.
        content = "";
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try {
        resultData.setData(content.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // Fall back to UTF-8 and remember it so that later calls use a valid charset.
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(content.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
    return resultData;
}
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
Class EmlExtractor, method appendAttachment.
/**
 * Appends the text extracted from a mail body part to the given buffer, followed by a space.
 *
 * <p>The MIME type is resolved from the decoded file name of the body part. Nothing is
 * appended when no MIME type or no extractor is found. Failures of the nested extractor and
 * of the body part parsing are logged at debug level and otherwise ignored.</p>
 *
 * @param buf the buffer receiving the extracted text
 * @param bodyPart the body part to extract
 */
protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) {
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    try {
        final String attachmentName = getDecodeText(bodyPart.getFileName());
        final String contentType = mimeTypeHelper.getContentType(null, attachmentName);
        if (contentType == null) {
            return;
        }
        final Extractor attachmentExtractor = extractorFactory.getExtractor(contentType);
        if (attachmentExtractor == null) {
            return;
        }
        try (final InputStream stream = bodyPart.getInputStream()) {
            final Map<String, String> extractParams = new HashMap<>();
            extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, attachmentName);
            final String text = attachmentExtractor.getText(stream, extractParams).getContent();
            buf.append(text);
            buf.append(' ');
        } catch (final Exception e) {
            // Best effort: a failing attachment must not abort the extraction of the mail.
            if (logger.isDebugEnabled()) {
                logger.debug("Exception in an internal extractor.", e);
            }
        }
    } catch (final MessagingException e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Exception in parsing BodyPart.", e);
        }
    }
}
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
Class TarExtractor, method getTextInternal.
/**
 * Extracts the text of every supported entry of a tar archive and joins the results with
 * line breaks.
 *
 * <p>For each entry the MIME type is resolved from the entry name and a matching extractor
 * is looked up; entries without a MIME type or extractor are skipped. The declared entry
 * sizes are summed up and checked against {@code maxContentSize} unless it is -1.</p>
 *
 * @param in the stream providing the tar archive
 * @param mimeTypeHelper the helper resolving MIME types from entry names
 * @param extractorFactory the factory providing extractors by MIME type
 * @return the trimmed, concatenated text of all extracted entries
 * @throws MaxLengthExceededException if the accumulated entry size exceeds {@code maxContentSize}
 * @throws ExtractException if reading the archive fails before any text was extracted
 */
protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
    final StringBuilder buf = new StringBuilder(1000);
    ArchiveInputStream ais = null;
    try {
        ais = archiveStreamFactory.createArchiveInputStream("tar", in);
        TarArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        // The archive stream is shared by all entries; the wrapper presumably
                        // keeps the nested extractor from closing it.
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        // Best effort: one unreadable entry must not abort the whole archive.
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
        // Some text was already extracted, so it is returned; the failure is logged
        // instead of being dropped silently.
        if (logger.isDebugEnabled()) {
            logger.debug("Failed to read the whole archive. Returning the partially extracted content.", e);
        }
    } finally {
        CloseableUtil.closeQuietly(ais);
    }
    return buf.toString().trim();
}
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.
Class ZipExtractor, method getText.
/**
 * Extracts the text of every supported entry of an archive and joins the results with
 * line breaks.
 *
 * <p>The input is wrapped in a {@link BufferedInputStream} when it does not support
 * mark/reset. For each entry the MIME type is resolved from the entry name and a matching
 * extractor is looked up; entries without a MIME type or extractor are skipped. The known
 * entry sizes are summed up and checked against {@code maxContentSize} unless it is -1.</p>
 *
 * @param in the stream providing the archive; must not be null
 * @param params extraction parameters (not used by this implementation)
 * @return the extracted data holding the trimmed, concatenated text of all entries
 * @throws CrawlerSystemException if {@code in} is null
 * @throws MaxLengthExceededException if the accumulated entry size exceeds {@code maxContentSize}
 * @throws ExtractException if reading the archive fails before any text was extracted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            // getSize() reports -1 when the size of an entry is not known; adding it would
            // lower the running total and weaken the size limit, so only positive sizes count.
            final long entrySize = entry.getSize();
            if (entrySize > 0) {
                contentSize += entrySize;
            }
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        // The archive stream is shared by all entries; the wrapper presumably
                        // keeps the nested extractor from closing it.
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        // Best effort: one unreadable entry must not abort the whole archive.
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
        // Some text was already extracted, so it is returned; the failure is logged
        // instead of being dropped silently.
        if (logger.isDebugEnabled()) {
            logger.debug("Failed to read the whole archive. Returning the partially extracted content.", e);
        }
    }
    return new ExtractData(buf.toString().trim());
}
Use of org.codelibs.fess.crawler.extractor.Extractor in project fess by codelibs.
Class FessStandardTransformer, method getExtractor.
/**
 * Resolves the extractor for the MIME type of the given response, falling back to the
 * "tikaExtractor" component when the factory has no extractor for that MIME type.
 *
 * @param responseData the response whose MIME type selects the extractor
 * @return the resolved extractor, never null
 * @throws FessSystemException if the extractor factory or the fallback extractor is missing
 */
@Override
protected Extractor getExtractor(final ResponseData responseData) {
    final ExtractorFactory factory = ComponentUtil.getExtractorFactory();
    if (factory == null) {
        throw new FessSystemException("Could not find extractorFactory.");
    }

    Extractor resolved = factory.getExtractor(responseData.getMimeType());
    if (resolved == null) {
        // No extractor registered for this MIME type: use the generic fallback component.
        resolved = ComponentUtil.getComponent("tikaExtractor");
    }
    if (resolved == null) {
        throw new FessSystemException("Could not find tikaExtractor.");
    }

    if (logger.isDebugEnabled()) {
        logger.debug("url={}, extractor={}", responseData.getUrl(), resolved);
    }
    return resolved;
}
Aggregations