Use of org.codelibs.fess.crawler.extractor.ExtractorFactory in the project fess-crawler by codelibs.
Class ZipExtractor, method getText.
/**
 * Extracts text from the entries of a ZIP archive and concatenates the results.
 *
 * <p>For each entry, a content type is looked up from the entry name; when an
 * extractor is registered for that type, the entry is passed to it and the
 * returned content is appended, followed by a newline. Entries without a known
 * content type or extractor are skipped. A failure inside a nested extractor is
 * logged at debug level and does not stop processing of the remaining entries.</p>
 *
 * @param in the archive stream; wrapped in a {@link BufferedInputStream} when it
 *        does not support mark/reset
 * @param params extraction parameters (not read by this method)
 * @return the trimmed, concatenated text of all entries that could be extracted
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws MaxLengthExceededException if the accumulated entry size exceeds
 *         {@code maxContentSize} (a value of {@code -1} disables the check)
 * @throws ExtractException if reading the archive fails before any text was extracted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);

    try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            // getSize() reports -1 when the entry size is unknown; adding that value
            // would shrink the running total and weaken the limit check below.
            final long entrySize = entry.getSize();
            if (entrySize > 0) {
                contentSize += entrySize;
            }
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }

            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType == null) {
                continue;
            }
            final Extractor extractor = extractorFactory.getExtractor(mimeType);
            if (extractor == null) {
                continue;
            }

            try {
                final Map<String, String> map = new HashMap<>();
                map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                // The archive stream is shared by all entries; presumably the wrapper
                // keeps a nested extractor from closing it (verify IgnoreCloseInputStream).
                buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                buf.append('\n');
            } catch (final Exception e) {
                // Best effort: one unreadable entry must not discard the others.
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
        // Some text was already extracted: return it, but leave a trace of the failure.
        if (logger.isDebugEnabled()) {
            logger.debug("Failed to read the whole archive. Returning the partial content.", e);
        }
    }

    return new ExtractData(buf.toString().trim());
}
Use of org.codelibs.fess.crawler.extractor.ExtractorFactory in the project fess-crawler by codelibs.
Class AbstractExtractor, method register.
/**
 * Registers this extractor with the extractor factory for the given keys.
 *
 * <p>The factory is looked up from the crawler container under the component
 * name {@code "extractorFactory"}.</p>
 *
 * @param keyList the keys passed to the factory together with this extractor
 */
public void register(final List<String> keyList) {
    final ExtractorFactory factory = crawlerContainer.getComponent("extractorFactory");
    factory.addExtractor(keyList, this);
}
Use of org.codelibs.fess.crawler.extractor.ExtractorFactory in the project fess by codelibs.
Class FessStandardTransformer, method getExtractor.
/**
 * Selects the extractor for a response based on its MIME type.
 *
 * <p>When the factory has no extractor for the MIME type, the component named
 * {@code "tikaExtractor"} is used instead.</p>
 *
 * @param responseData the response whose MIME type selects the extractor
 * @return the selected extractor, never {@code null}
 * @throws FessSystemException if the extractor factory is unavailable, or if
 *         neither a MIME-type-specific extractor nor the fallback can be found
 */
@Override
protected Extractor getExtractor(final ResponseData responseData) {
    final ExtractorFactory factory = ComponentUtil.getExtractorFactory();
    if (factory == null) {
        throw new FessSystemException("Could not find extractorFactory.");
    }

    final Extractor byMimeType = factory.getExtractor(responseData.getMimeType());
    final Extractor selected;
    if (byMimeType != null) {
        selected = byMimeType;
    } else {
        // No extractor registered for this MIME type: use the fallback component.
        selected = ComponentUtil.getComponent("tikaExtractor");
        if (selected == null) {
            throw new FessSystemException("Could not find tikaExtractor.");
        }
    }

    if (logger.isDebugEnabled()) {
        logger.debug("url={}, extractor={}", responseData.getUrl(), selected);
    }
    return selected;
}
Use of org.codelibs.fess.crawler.extractor.ExtractorFactory in the project fess by codelibs.
Class FessFileTransformer, method getExtractor.
/**
 * Looks up the extractor registered for the MIME type of a response.
 *
 * <p>No fallback is applied here: the value returned by the factory is passed
 * through unchanged.</p>
 *
 * @param responseData the response whose MIME type selects the extractor
 * @return whatever the factory returns for the MIME type
 * @throws FessSystemException if the extractor factory is unavailable
 */
@Override
protected Extractor getExtractor(final ResponseData responseData) {
    final ExtractorFactory factory = ComponentUtil.getExtractorFactory();
    if (factory == null) {
        throw new FessSystemException("Could not find extractorFactory.");
    }

    final Extractor resolved = factory.getExtractor(responseData.getMimeType());
    if (logger.isDebugEnabled()) {
        logger.debug("url={}, extractor={}", responseData.getUrl(), resolved);
    }
    return resolved;
}
Use of org.codelibs.fess.crawler.extractor.ExtractorFactory in the project fess-crawler by codelibs.
Class TarExtractor, method getText.
/**
 * Extracts text from the given stream by delegating to {@code getTextInternal}.
 *
 * @param in the stream to read
 * @param params extraction parameters (not read by this method)
 * @return the extracted data wrapping the result of {@code getTextInternal}
 * @throws CrawlerSystemException if {@code in} is {@code null}
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    // Arguments are evaluated left to right, so the helper is resolved before the factory.
    return new ExtractData(getTextInternal(in, getMimeTypeHelper(), getExtractorFactory()));
}
Aggregations