Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in project fess-crawler by codelibs.
Class TarExtractor, method getTextInternal.
protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
    final StringBuilder buf = new StringBuilder(1000);
    ArchiveInputStream ais = null;
    try {
        ais = archiveStreamFactory.createArchiveInputStream("tar", in);
        TarArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    } finally {
        CloseableUtil.closeQuietly(ais);
    }
    return buf.toString().trim();
}
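In this snippet the stream handed to the nested extractor is wrapped in IgnoreCloseInputStream so that the inner extractor (typically Tika-based, which closes the stream it is given) cannot close the shared ArchiveInputStream and cut off the remaining archive entries. As a rough illustration of the idea only, not the project's actual source, such a wrapper can be a FilterInputStream whose close() is a no-op; the class name below is hypothetical.

// Hypothetical sketch: a FilterInputStream that swallows close() so a nested
// reader cannot close the underlying, shared stream. Not the actual
// IgnoreCloseInputStream source; shown only to illustrate the pattern.
import java.io.FilterInputStream;
import java.io.InputStream;

public class NoCloseInputStream extends FilterInputStream {

    public NoCloseInputStream(final InputStream in) {
        super(in);
    }

    @Override
    public void close() {
        // Intentionally a no-op: the owner of the wrapped stream closes it.
    }
}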
Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in project fess-crawler by codelibs.
Class ZipExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    }
    return new ExtractData(buf.toString().trim());
}
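Unlike the tar case above, ZipExtractor lets ArchiveStreamFactory auto-detect the archive format, which requires mark/reset support on the stream; hence the BufferedInputStream wrapping. The following standalone sketch shows that same detection pattern in isolation; the file name is illustrative and the snippet is only a minimal example, not part of fess-crawler.

// Minimal sketch of the auto-detection pattern used above: Commons Compress
// sniffs the archive format via mark/reset, so a plain stream is buffered first.
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;

public class ZipEntryListing {
    public static void main(final String[] args) throws Exception {
        final ArchiveStreamFactory factory = new ArchiveStreamFactory();
        try (InputStream raw = Files.newInputStream(Paths.get("sample.zip"));
                InputStream in = raw.markSupported() ? raw : new BufferedInputStream(raw);
                ArchiveInputStream ais = factory.createArchiveInputStream(in)) {
            ArchiveEntry entry;
            while ((entry = ais.getNextEntry()) != null) {
                System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
            }
        }
    }
}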
Use of org.codelibs.fess.crawler.util.IgnoreCloseInputStream in project fess-crawler by codelibs.
Class LhaExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    File tempFile = null;
    LhaFile lhaFile = null;
    try {
        tempFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream fos = new FileOutputStream(tempFile)) {
            CopyUtil.copy(in, fos);
        }
        lhaFile = new LhaFile(tempFile);
        @SuppressWarnings("unchecked")
        final Enumeration<LhaHeader> entries = lhaFile.entries();
        long contentSize = 0;
        while (entries.hasMoreElements()) {
            final LhaHeader head = entries.nextElement();
            contentSize += head.getOriginalSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = head.getPath();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    InputStream is = null;
                    try {
                        is = lhaFile.getInputStream(head);
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    } finally {
                        CloseableUtil.closeQuietly(is);
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (lhaFile != null) {
            try {
                lhaFile.close();
            } catch (final IOException e) {
                // ignore
            }
        }
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
    return new ExtractData(buf.toString().trim());
}
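LhaExtractor differs from the two extractors above in that LhaFile is opened from a File rather than from the incoming stream, so the input is first spooled to a temporary file that is deleted in the finally block. A generic sketch of that spool-to-temp-file pattern, with illustrative names and no fess-crawler dependencies, might look like this:

// Illustrative sketch only: copy a stream to a temp file for a reader that
// needs random access, and make sure the temp file is cleaned up afterwards.
import java.io.File;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;

public final class TempFileSpool {

    public static void withTempFile(final InputStream in) throws Exception {
        final File tempFile = File.createTempFile("crawler-", ".lzh");
        try {
            Files.copy(in, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
            // ... open tempFile with a random-access reader here ...
        } finally {
            if (!tempFile.delete()) {
                // Fall back to JVM-exit cleanup if the file is still in use.
                tempFile.deleteOnExit();
            }
        }
    }
}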