Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs.
Shown below: the getText method of the class AbstractXmlExtractor.
/**
 * Reads the whole stream, decodes it with the encoding reported by
 * {@code getEncoding}, runs the decoded text through {@code UNESCAPE_HTML4}
 * and wraps the result of {@code extractString} in an {@code ExtractData}.
 *
 * @param in the stream to read; must not be {@code null}
 * @param params extraction parameters (not read by this method)
 * @return the extraction result built from the processed content
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException wrapping any exception raised while reading,
 *         decoding or extracting
 */
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        // Buffer first: the encoding is determined from the same stream
        // that is subsequently read in full.
        final BufferedInputStream buffered = new BufferedInputStream(in);
        final String charsetName = getEncoding(buffered);
        final byte[] rawBytes = InputStreamUtil.getBytes(buffered);
        final String decoded = new String(rawBytes, charsetName);
        final String unescaped = UNESCAPE_HTML4.translate(decoded);
        final String extracted = extractString(unescaped);
        return new ExtractData(extracted);
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs.
Shown below: the test_getTika_zip_bom method of the class TikaExtractorTest.
/**
 * Extracting {@code extractor/zip/test_size.zip} with a maximum compression
 * ratio of 1 and a maximum uncompressed size of 10000 must be rejected with
 * an {@code ExtractException}; the test fails if extraction succeeds.
 */
public void test_getTika_zip_bom() {
    final InputStream zipStream = ResourceUtil.getResourceAsStream("extractor/zip/test_size.zip");
    tikaExtractor.maxCompressionRatio = 1;
    tikaExtractor.maxUncompressionSize = 10000;

    ExtractException rejection = null;
    try {
        tikaExtractor.getText(zipStream, null);
    } catch (final ExtractException e) {
        rejection = e;
    }

    if (rejection == null) {
        // Extraction completed without being rejected.
        fail();
    } else {
        logger.info(rejection.getMessage());
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs.
Shown below: the getText method of the class PdfExtractor.
/**
 * Extracts the text and metadata of a PDF document read from {@code in}.
 *
 * <p>The whole operation runs while holding {@code pdfBoxLockObj}. Text
 * stripping is executed on a separate worker thread so the caller can stop
 * waiting once {@code timeout} has elapsed; {@code timeout} is handed to
 * {@link Thread#join(long)} and is therefore interpreted as milliseconds.
 * When the worker has not finished in time it is interrupted repeatedly
 * (up to 100 times, 100 ms apart) before the extraction is reported as
 * failed.
 *
 * @param in the PDF stream; must not be {@code null}
 * @param params extraction parameters, passed to {@code getPassword}
 * @return the stripped text together with whatever {@code extractMetadata}
 *         adds to it
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws ExtractException if loading, stripping or metadata extraction
 *         fails, if the worker does not finish in time, or if the calling
 *         thread is interrupted while waiting
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 *      java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    synchronized (pdfBoxLockObj) {
        // PDFBox is not a thread-safe library
        final String password = getPassword(params);
        try (PDDocument document = PDDocument.load(in, password)) {
            final StringWriter output = new StringWriter();
            final PDFTextStripper stripper = new PDFTextStripper();
            final AtomicBoolean done = new AtomicBoolean(false);
            // Written by the worker before done.set(true) and read by this
            // thread only after done.get() returned true.
            final Set<Exception> exceptionSet = new HashSet<>();
            final Thread task = new Thread(() -> {
                try {
                    stripper.writeText(document, output);
                } catch (final Exception e) {
                    exceptionSet.add(e);
                } finally {
                    done.set(true);
                }
            }, Thread.currentThread().getName() + "-pdf");
            task.setDaemon(isDaemonThread);
            task.start();
            task.join(timeout);
            if (!done.get()) {
                // Keep nudging the worker so it does not outlive this call
                // for longer than necessary.
                for (int i = 0; i < 100 && !done.get(); i++) {
                    task.interrupt();
                    Thread.sleep(100);
                }
                // timeout is the value given to Thread.join(long), i.e. milliseconds.
                throw new ExtractException("PDFBox process cannot finish in " + timeout + " ms.");
            }
            if (!exceptionSet.isEmpty()) {
                throw exceptionSet.iterator().next();
            }
            output.flush();
            final ExtractData extractData = new ExtractData(output.toString());
            extractMetadata(document, extractData);
            return extractData;
        } catch (final InterruptedException e) {
            // Preserve the interrupt for the caller; wrapping the exception
            // alone would silently clear the thread's interrupt status.
            Thread.currentThread().interrupt();
            throw new ExtractException(e);
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs.
Shown below: the getContent method of the class TikaExtractor.
/**
 * Lets {@code out} write its content into a {@code DeferredFileOutputStream}
 * (constructed with {@code memorySize} and a temp file), then reads that
 * content back and returns it after {@code TextUtil.normalizeText}
 * processing.
 *
 * @param out callback that writes the raw content to the supplied writer
 * @param encoding charset name used for both writing and reading back;
 *        {@code Constants.UTF_8} is used when {@code null}
 * @return the normalized text
 * @throws TikaException rethrown unchanged when raised inside the callback
 *         or while reading back
 * @throws CrawlerSystemException if the temp file cannot be created
 * @throws ExtractException wrapping any other exception
 */
protected String getContent(final ContentWriter out, final String encoding) throws TikaException {
    final File spillFile;
    try {
        spillFile = File.createTempFile("tika", ".tmp");
    } catch (final IOException e) {
        throw new CrawlerSystemException("Failed to create a temp file.", e);
    }

    final String charsetName;
    if (encoding != null) {
        charsetName = encoding;
    } else {
        charsetName = Constants.UTF_8;
    }

    try (DeferredFileOutputStream buffer = new DeferredFileOutputStream(memorySize, spillFile)) {
        final BufferedWriter sink = new BufferedWriter(new OutputStreamWriter(buffer, charsetName));
        out.accept(sink);
        // Push buffered characters down to the deferred stream before reading it back.
        sink.flush();

        try (Reader source = new InputStreamReader(getContentStream(buffer), charsetName)) {
            return TextUtil.normalizeText(source)
                    .initialCapacity(initialBufferSize)
                    .maxAlphanumTermSize(maxAlphanumTermSize)
                    .maxSymbolTermSize(maxSymbolTermSize)
                    .duplicateTermRemoved(replaceDuplication)
                    .execute();
        }
    } catch (final TikaException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Failed to read a content.", e);
    } finally {
        // The temp file is removed regardless of the outcome.
        final boolean deleteFailed = spillFile.exists() && !spillFile.delete();
        if (deleteFailed) {
            logger.warn("Failed to delete " + spillFile.getAbsolutePath());
        }
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs.
Shown below: the getText method of the class TikaExtractor.
/**
 * Extracts text and metadata from {@code inputStream} using Tika.
 *
 * <p>Processing steps, as implemented below:
 * <ol>
 * <li>A {@code ByteArrayInputStream} is marked so it can be reset and
 * re-read; any other stream is copied to a temp file during the first parse
 * attempt and re-read from that file afterwards.</li>
 * <li>{@code System.out} and {@code System.err} are redirected to in-memory
 * buffers for the duration of the parse, restored afterwards, and whatever
 * was captured is logged at info / warn level.</li>
 * <li>If the parsed content is blank, parsing is retried without the
 * resource name, then without resource name and content type, and finally
 * (when {@code readAsTextIfFailed} is set) the input is read as plain
 * text.</li>
 * <li>A {@code TikaException} whose message contains {@code "bomb"} is not
 * recovered from. A {@code TikaException} caused by a {@code SAXException}
 * is retried with the {@code "xmlExtractor"} component when one is
 * available.</li>
 * </ol>
 *
 * @param inputStream the content to extract; must not be {@code null}
 * @param params optional parameters (resource name, content type, content
 *        encoding, password); may be {@code null}
 * @return the extracted content with all Tika metadata values attached
 * @throws CrawlerSystemException if {@code inputStream} is {@code null}
 * @throws ExtractException if the temp file cannot be created or if
 *         extraction fails for any reason
 */
@Override
public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
    if (inputStream == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }

    final File tempFile;
    final boolean isByteStream = inputStream instanceof ByteArrayInputStream;
    if (isByteStream) {
        // Remember the current position so every retry can reset() to it.
        inputStream.mark(0);
        tempFile = null;
    } else {
        try {
            tempFile = File.createTempFile("tikaExtractor-", ".out");
        } catch (final IOException e) {
            throw new ExtractException("Could not create a temp file.", e);
        }
    }

    try {
        // NOTE(review): System.setOut/setErr replace the JVM-wide streams, so
        // output of other threads is captured too while this method runs.
        final PrintStream originalOutStream = System.out;
        final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        System.setOut(new PrintStream(outStream, true));
        final PrintStream originalErrStream = System.err;
        final ByteArrayOutputStream errStream = new ByteArrayOutputStream();
        System.setErr(new PrintStream(errStream, true));
        try {
            final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
            final String contentType = params == null ? null : params.get(HttpHeaders.CONTENT_TYPE);
            String contentEncoding = params == null ? null : params.get(HttpHeaders.CONTENT_ENCODING);
            final String pdfPassword = getPassword(params);

            final Metadata metadata = createMetadata(resourceName, contentType, contentEncoding, pdfPassword);

            final Parser parser = new TikaDetectParser();
            final ParseContext parseContext = createParseContext(parser, params);

            // First attempt: a non-byte stream is spooled to the temp file
            // here so that later attempts can re-read it.
            String content = getContent(writer -> {
                InputStream in = null;
                try {
                    if (!isByteStream) {
                        try (OutputStream out = new FileOutputStream(tempFile)) {
                            CopyUtil.copy(inputStream, out);
                        }
                        in = new FileInputStream(tempFile);
                    } else {
                        in = inputStream;
                    }
                    parser.parse(in, new BodyContentHandler(writer), metadata, parseContext);
                } finally {
                    CloseableUtil.closeQuietly(in);
                }
            }, contentEncoding);

            if (StringUtil.isBlank(content)) {
                if (resourceName != null) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a resource name: {}", resourceName);
                    }
                    final Metadata metadata2 = createMetadata(null, contentType, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            in = reopenSourceStream(inputStream, tempFile, isByteStream);
                            parser.parse(in, new BodyContentHandler(writer), metadata2, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }
                if (StringUtil.isBlank(content) && contentType != null) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a content type: {}", contentType);
                    }
                    final Metadata metadata3 = createMetadata(null, null, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            in = reopenSourceStream(inputStream, tempFile, isByteStream);
                            parser.parse(in, new BodyContentHandler(writer), metadata3, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }

                if (readAsTextIfFailed && StringUtil.isBlank(content)) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("read the content as a text.");
                    }
                    if (contentEncoding == null) {
                        contentEncoding = Constants.UTF_8;
                    }
                    final String enc = contentEncoding;
                    content = getContent(writer -> {
                        BufferedReader br = null;
                        try {
                            br = new BufferedReader(
                                    new InputStreamReader(reopenSourceStream(inputStream, tempFile, isByteStream), enc));
                            String line;
                            // NOTE(review): readLine() strips line terminators and none
                            // are written back, so adjacent lines are concatenated.
                            // Confirm this is intended.
                            while ((line = br.readLine()) != null) {
                                writer.write(line);
                            }
                        } catch (final Exception e) {
                            logger.warn("Could not read " + (tempFile != null ? tempFile.getAbsolutePath() : "a byte stream"), e);
                        } finally {
                            CloseableUtil.closeQuietly(br);
                        }
                    }, contentEncoding);
                }
            }

            final ExtractData extractData = new ExtractData(content);

            final String[] names = metadata.names();
            Arrays.sort(names);
            for (final String name : names) {
                extractData.putValues(name, metadata.getValues(name));
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Result: metadata: {}", metadata);
            }

            return extractData;
        } catch (final TikaException e) {
            // getMessage() may be null; a null message must not mask the
            // original exception with a NullPointerException.
            final String message = e.getMessage();
            if (message != null && message.contains("bomb")) {
                throw e;
            }
            final Throwable cause = e.getCause();
            if (cause instanceof SAXException) {
                final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor");
                if (xmlExtractor != null) {
                    InputStream in = null;
                    try {
                        in = reopenSourceStream(inputStream, tempFile, isByteStream);
                        return xmlExtractor.getText(in, params);
                    } finally {
                        CloseableUtil.closeQuietly(in);
                    }
                }
            }
            throw e;
        } finally {
            if (originalOutStream != null) {
                System.setOut(originalOutStream);
            }
            if (originalErrStream != null) {
                System.setErr(originalErrStream);
            }
            try {
                if (logger.isInfoEnabled()) {
                    final byte[] bs = outStream.toByteArray();
                    if (bs.length != 0) {
                        logger.info(new String(bs, outputEncoding));
                    }
                }
                if (logger.isWarnEnabled()) {
                    final byte[] bs = errStream.toByteArray();
                    if (bs.length != 0) {
                        logger.warn(new String(bs, outputEncoding));
                    }
                }
            } catch (final Exception ignored) {
                // Deliberately ignored: logging the captured console output is
                // best-effort and must not replace the extraction result or
                // the exception already in flight.
            }
        }
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
}

/**
 * Returns a stream positioned at the start of the source content for a
 * further read: the byte stream itself after {@code reset()}, or a new
 * {@code FileInputStream} over the spooled temp file.
 */
private static InputStream reopenSourceStream(final InputStream inputStream, final File tempFile, final boolean isByteStream)
        throws IOException {
    if (isByteStream) {
        inputStream.reset();
        return inputStream;
    }
    return new FileInputStream(tempFile);
}
Aggregations