Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in the fess-crawler project by codelibs.
From the class TarExtractor, method getTextInternal.
protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
final StringBuilder buf = new StringBuilder(1000);
ArchiveInputStream ais = null;
try {
ais = archiveStreamFactory.createArchiveInputStream("tar", in);
TarArchiveEntry entry = null;
long contentSize = 0;
while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
contentSize += entry.getSize();
if (maxContentSize != -1 && contentSize > maxContentSize) {
throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
final String filename = entry.getName();
final String mimeType = mimeTypeHelper.getContentType(null, filename);
if (mimeType != null) {
final Extractor extractor = extractorFactory.getExtractor(mimeType);
if (extractor != null) {
try {
final Map<String, String> map = new HashMap<>();
map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
logger.debug("Exception in an internal extractor.", e);
} catch (final MaxLengthExceededException e) {
throw e;
} catch (final Exception e) {
if (buf.length() == 0) {
throw new ExtractException("Could not extract a content.", e);
} finally {
return buf.toString().trim();
Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in the fess-crawler project by codelibs.
From the class ZipExtractor, method getText.
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new CrawlerSystemException("The inputstream is null.");
final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
final ExtractorFactory extractorFactory = getExtractorFactory();
final StringBuilder buf = new StringBuilder(1000);
try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
ZipArchiveEntry entry = null;
long contentSize = 0;
while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
contentSize += entry.getSize();
if (maxContentSize != -1 && contentSize > maxContentSize) {
throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
final String filename = entry.getName();
final String mimeType = mimeTypeHelper.getContentType(null, filename);
if (mimeType != null) {
final Extractor extractor = extractorFactory.getExtractor(mimeType);
if (extractor != null) {
try {
final Map<String, String> map = new HashMap<>();
map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
logger.debug("Exception in an internal extractor.", e);
} catch (final MaxLengthExceededException e) {
throw e;
} catch (final Exception e) {
if (buf.length() == 0) {
throw new ExtractException("Could not extract a content.", e);
return new ExtractData(buf.toString().trim());
Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in the fess-crawler project by codelibs.
From the class LhaExtractor, method getText.
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new CrawlerSystemException("The inputstream is null.");
final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
final ExtractorFactory extractorFactory = getExtractorFactory();
final StringBuilder buf = new StringBuilder(1000);
File tempFile = null;
LhaFile lhaFile = null;
try {
tempFile = File.createTempFile("crawler-", ".lzh");
try (FileOutputStream fos = new FileOutputStream(tempFile)) {
CopyUtil.copy(in, fos);
lhaFile = new LhaFile(tempFile);
@SuppressWarnings("unchecked") final Enumeration<LhaHeader> entries = lhaFile.entries();
long contentSize = 0;
while (entries.hasMoreElements()) {
final LhaHeader head = entries.nextElement();
contentSize += head.getOriginalSize();
if (maxContentSize != -1 && contentSize > maxContentSize) {
throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
final String filename = head.getPath();
final String mimeType = mimeTypeHelper.getContentType(null, filename);
if (mimeType != null) {
final Extractor extractor = extractorFactory.getExtractor(mimeType);
if (extractor != null) {
InputStream is = null;
try {
is = lhaFile.getInputStream(head);
final Map<String, String> map = new HashMap<>();
map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
} catch (final Exception e) {
if (logger.isDebugEnabled()) {
logger.debug("Exception in an internal extractor.", e);
} finally {
} catch (final MaxLengthExceededException e) {
throw e;
} catch (final Exception e) {
throw new ExtractException("Could not extract a content.", e);
} finally {
if (lhaFile != null) {
try {
} catch (final IOException e) {
// ignore
if (tempFile != null && !tempFile.delete()) {
logger.warn("Failed to delete " + tempFile.getAbsolutePath());
return new ExtractData(buf.toString().trim());