use of org.codelibs.fess.crawler.extractor.ExtractorFactory in project fess-crawler by codelibs.
the class TextTransformer method transform.
/**
 * Extracts the text of a response body and returns it as encoded bytes.
 *
 * <p>The extractor is looked up by the response's MIME type through the
 * "extractorFactory" component. The extracted text is encoded with
 * {@code charsetName}; if that charset is not supported, UTF-8 is used
 * instead and {@code charsetName} is updated to UTF-8 for later calls.
 *
 * @param responseData the crawled response; must be non-null and have a body
 * @return a result holding the encoded text, the transformer name and the
 *         encoding that was actually used
 * @throws CrawlingAccessException if there is no response body, no extractor
 *         is available for the MIME type, or the extraction fails
 * @throws CrawlerSystemException if the "extractorFactory" component is missing
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }

    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }

    final String mimeType = responseData.getMimeType();
    final Extractor extractor = extractorFactory.getExtractor(mimeType);
    if (extractor == null) {
        // Fail before the response body is opened, with a message that names
        // the real cause instead of a wrapped NullPointerException.
        throw new CrawlingAccessException("Could not find an extractor for " + mimeType + ".");
    }

    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);

    String content = null;
    try (final InputStream in = responseData.getResponseBody()) {
        content = extractor.getText(in, params).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }

    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try {
        resultData.setData(content.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // Remember the fallback so the reported encoding matches the bytes.
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(content.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
    return resultData;
}
use of org.codelibs.fess.crawler.extractor.ExtractorFactory in project fess-crawler by codelibs.
the class LhaExtractorTest method setUp.
/**
 * Builds a crawler container with the MIME type helper, the Tika and LHA
 * extractors, and an extractor factory, then stores the LHA extractor in
 * the test's {@code lhaExtractor} field.
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    final StandardCrawlerContainer container = new StandardCrawlerContainer();
    container.singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .singleton("tikaExtractor", TikaExtractor.class)
            .singleton("lhaExtractor", LhaExtractor.class)
            .<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, extractors -> {
                // Components are resolved inside the callback, as in the original wiring.
                final TikaExtractor tika = container.getComponent("tikaExtractor");
                final LhaExtractor lha = container.getComponent("lhaExtractor");
                for (final String textMimeType : new String[] { "text/plain", "text/html" }) {
                    extractors.addExtractor(textMimeType, tika);
                }
                extractors.addExtractor("application/x-lha", lha);
            });

    lhaExtractor = container.getComponent("lhaExtractor");
}
use of org.codelibs.fess.crawler.extractor.ExtractorFactory in project fess-crawler by codelibs.
the class TarExtractorTest method setUp.
/**
 * Builds a crawler container with the archive and compressor stream
 * factories, the MIME type helper, the Tika and TAR extractors, and an
 * extractor factory, then stores the TAR extractor in the test's
 * {@code tarExtractor} field.
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    final StandardCrawlerContainer container = new StandardCrawlerContainer();
    container.singleton("archiveStreamFactory", ArchiveStreamFactory.class)
            .singleton("compressorStreamFactory", CompressorStreamFactory.class)
            .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .singleton("tikaExtractor", TikaExtractor.class)
            .singleton("tarExtractor", TarExtractor.class)
            .<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, extractors -> {
                // Components are resolved inside the callback, as in the original wiring.
                final TikaExtractor tika = container.getComponent("tikaExtractor");
                final TarExtractor tar = container.getComponent("tarExtractor");
                for (final String textMimeType : new String[] { "text/plain", "text/html" }) {
                    extractors.addExtractor(textMimeType, tika);
                }
                extractors.addExtractor("application/tar", tar);
            });

    tarExtractor = container.getComponent("tarExtractor");
}
use of org.codelibs.fess.crawler.extractor.ExtractorFactory in project fess-crawler by codelibs.
the class ZipExtractorTest method setUp.
/**
 * Builds a crawler container with the archive and compressor stream
 * factories, the MIME type helper, the Tika and ZIP extractors, and an
 * extractor factory, then stores the ZIP extractor in the test's
 * {@code zipExtractor} field.
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    final StandardCrawlerContainer container = new StandardCrawlerContainer();
    container.singleton("archiveStreamFactory", ArchiveStreamFactory.class)
            .singleton("compressorStreamFactory", CompressorStreamFactory.class)
            .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .singleton("tikaExtractor", TikaExtractor.class)
            .singleton("zipExtractor", ZipExtractor.class)
            .<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, extractors -> {
                // Components are resolved inside the callback, as in the original wiring.
                final TikaExtractor tika = container.getComponent("tikaExtractor");
                final ZipExtractor zip = container.getComponent("zipExtractor");
                for (final String textMimeType : new String[] { "text/plain", "text/html" }) {
                    extractors.addExtractor(textMimeType, tika);
                }
                extractors.addExtractor("application/zip", zip);
            });

    zipExtractor = container.getComponent("zipExtractor");
}
use of org.codelibs.fess.crawler.extractor.ExtractorFactory in project fess-crawler by codelibs.
the class EmlExtractor method appendAttachment.
/**
 * Extracts the text of a mail attachment and appends it, followed by a
 * single space, to {@code buf}.
 *
 * <p>The attachment's MIME type is derived from its decoded file name and
 * used to pick an extractor. Nothing is appended when no MIME type or
 * extractor is found, or when the extractor yields no content. Failures are
 * logged at debug level and otherwise ignored so that one bad attachment
 * does not abort processing of the caller's message.
 *
 * @param buf the buffer receiving the extracted text
 * @param bodyPart the attachment part to read
 */
protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) {
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    try {
        final String filename = getDecodeText(bodyPart.getFileName());
        final String mimeType = mimeTypeHelper.getContentType(null, filename);
        if (mimeType == null) {
            return;
        }
        final Extractor extractor = extractorFactory.getExtractor(mimeType);
        if (extractor == null) {
            return;
        }
        try (final InputStream in = bodyPart.getInputStream()) {
            final Map<String, String> map = new HashMap<>();
            map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
            final String content = extractor.getText(in, map).getContent();
            // StringBuilder.append(null) would write the literal text "null".
            if (content != null) {
                buf.append(content).append(' ');
            }
        } catch (final Exception e) {
            // Best effort: an attachment that cannot be extracted is skipped.
            if (logger.isDebugEnabled()) {
                logger.debug("Exception in an internal extractor.", e);
            }
        }
    } catch (final MessagingException e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Exception in parsing BodyPart.", e);
        }
    }
}
Aggregations