Use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.
The class ApiExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (logger.isDebugEnabled()) {
        logger.debug("Accessing " + url);
    }
    // start the access timeout watcher for the current thread
    AccessTimeoutTarget accessTimeoutTarget = null;
    TimeoutTask accessTimeoutTask = null;
    if (accessTimeout != null) {
        accessTimeoutTarget = new AccessTimeoutTarget(Thread.currentThread());
        accessTimeoutTask = TimeoutManager.getInstance().addTimeoutTarget(accessTimeoutTarget, accessTimeout.intValue(), false);
    }
    final ExtractData data = new ExtractData();
    final HttpPost httpPost = new HttpPost(url);
    final HttpEntity postEntity = MultipartEntityBuilder.create()
            .setMode(HttpMultipartMode.BROWSER_COMPATIBLE)
            .setCharset(Charset.forName("UTF-8"))
            .addBinaryBody("filedata", in)
            .build();
    httpPost.setEntity(postEntity);
    try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
        if (response.getStatusLine().getStatusCode() != Constants.OK_STATUS_CODE) {
            logger.error("Failed to access " + url + ", code: " + response.getStatusLine().getStatusCode() + ".");
            return null;
        }
        data.setContent(EntityUtils.toString(response.getEntity(), Charsets.UTF_8));
        final Header[] headers = response.getAllHeaders();
        for (final Header header : headers) {
            data.putValue(header.getName(), header.getValue());
        }
    } catch (final IOException e) {
        throw new ExtractException(e);
    } finally {
        if (accessTimeout != null) {
            accessTimeoutTarget.stop();
            if (!accessTimeoutTask.isCanceled()) {
                accessTimeoutTask.cancel();
            }
        }
    }
    return data;
}
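A minimal caller sketch follows, assuming an ApiExtractor instance whose endpoint URL, HttpClient, and access timeout have already been configured elsewhere; the helper method name and the file handling are illustrative only, not part of fess-crawler. It shows where the unchecked ExtractException surfaces to the caller and why the null return on a non-200 response must be checked.

public String extractViaApi(final ApiExtractor apiExtractor, final Path path) {
    try (InputStream in = Files.newInputStream(path)) {
        // the params map is not consulted in the snippet above, so an empty map is enough
        final ExtractData data = apiExtractor.getText(in, Collections.emptyMap());
        // getText returns null when the remote endpoint does not answer with 200 OK
        return data != null ? data.getContent() : null;
    } catch (final ExtractException e) {
        // the IOException from the HTTP call is wrapped and rethrown as ExtractException
        logger.warn("Extraction via the remote API failed.", e);
        return null;
    } catch (final IOException e) {
        logger.warn("Could not read " + path + ".", e);
        return null;
    }
}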
Use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.
The class EmlExtractor, method getBodyText.
protected String getBodyText(final MimeMessage message) {
    StringBuilder buf = new StringBuilder(1000);
    try {
        final Object content = message.getContent();
        if (content instanceof Multipart) {
            final Multipart multipart = (Multipart) content;
            final int count = multipart.getCount();
            for (int i = 0; i < count; i++) {
                final BodyPart bodyPart = multipart.getBodyPart(i);
                if (Part.ATTACHMENT.equalsIgnoreCase(bodyPart.getDisposition())) {
                    appendAttachment(buf, bodyPart);
                } else if (bodyPart.isMimeType("text/plain")) {
                    buf.append(bodyPart.getContent().toString()).append(' ');
                } else if (bodyPart.isMimeType("text/html")) {
                    buf.append(bodyPart.getContent().toString()).append(' ');
                } else if (bodyPart.isMimeType("multipart/alternative") && bodyPart.getContent() instanceof Multipart) {
                    final Multipart alternativePart = (Multipart) bodyPart.getContent();
                    for (int j = 0; j < alternativePart.getCount(); j++) {
                        final BodyPart innerBodyPart = alternativePart.getBodyPart(j);
                        if (innerBodyPart.isMimeType("text/plain")) {
                            buf.append(innerBodyPart.getContent().toString()).append(' ');
                            break;
                        }
                    }
                }
            }
        } else if (content instanceof String) {
            buf.append(content.toString());
        }
    } catch (MessagingException | IOException e) {
        throw new ExtractException(e);
    }
    return buf.toString();
}
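As a hedged sketch of where getBodyText fits, the hypothetical helper below parses a raw .eml stream into a MimeMessage before delegating to it; the javax.mail Session setup and the helper name are assumptions for illustration, not part of EmlExtractor.

// Hypothetical helper, shown only to illustrate how a MimeMessage is obtained
// before getBodyText is called.
protected String extractBodyFromEml(final InputStream in) {
    try {
        final Session session = Session.getDefaultInstance(new Properties());
        final MimeMessage message = new MimeMessage(session, in);
        // MessagingException and IOException raised while reading the parts are
        // rethrown by getBodyText as an unchecked ExtractException
        return getBodyText(message);
    } catch (final MessagingException e) {
        throw new ExtractException(e);
    }
}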
Use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.
The class HtmlXpathExtractor, method getText.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 *      java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        final BufferedInputStream bis = new BufferedInputStream(in);
        final String enc = getEncoding(bis);
        final DOMParser parser = getDomParser();
        final InputSource inputSource = new InputSource(bis);
        inputSource.setEncoding(enc);
        parser.parse(inputSource);
        final Document document = parser.getDocument();
        final StringBuilder buf = new StringBuilder(255);
        final NodeList nodeList = getXPathAPI().selectNodeList(document, targetNodePath);
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node node = nodeList.item(i);
            buf.append(node.getTextContent()).append(' ');
        }
        return new ExtractData(buf.toString().replaceAll("\\s+", " ").trim());
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
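A brief usage sketch follows; in fess-crawler the extractor and its targetNodePath are normally configured through the crawler container, so the direct instantiation and the sample markup below are assumptions made only for illustration.

final HtmlXpathExtractor extractor = new HtmlXpathExtractor();
final byte[] html = "<html><body><p>Hello Fess</p></body></html>".getBytes(StandardCharsets.UTF_8);
try (InputStream in = new ByteArrayInputStream(html)) {
    final ExtractData data = extractor.getText(in, Collections.emptyMap());
    System.out.println(data.getContent());
} catch (final ExtractException e) {
    // any parsing or XPath failure is wrapped into ExtractException
    logger.warn("HTML extraction failed.", e);
} catch (final IOException e) {
    logger.warn("Stream handling failed.", e);
}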
Use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.
The class TarExtractor, method getTextInternal.
protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
    final StringBuilder buf = new StringBuilder(1000);
    ArchiveInputStream ais = null;
    try {
        ais = archiveStreamFactory.createArchiveInputStream("tar", in);
        TarArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    } finally {
        CloseableUtil.closeQuietly(ais);
    }
    return buf.toString().trim();
}
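The error-handling contract above is easy to miss: a failure inside a nested extractor is only logged at debug level, an oversized archive aborts with MaxLengthExceededException, and ExtractException is thrown only when nothing at all was extracted. A hedged caller sketch follows; the file path and the helper variables are assumed for illustration (the public getText would obtain the helpers itself, as the ZipExtractor snippet below does).

try (InputStream in = Files.newInputStream(Paths.get("archive.tar"))) {
    final String text = getTextInternal(in, mimeTypeHelper, extractorFactory);
    logger.info("Extracted " + text.length() + " characters from the tar archive.");
} catch (final MaxLengthExceededException e) {
    logger.warn("Archive exceeded the configured maxContentSize.", e);
} catch (final ExtractException e) {
    logger.warn("No entry of the tar archive could be extracted.", e);
} catch (final IOException e) {
    logger.warn("Could not open the archive.", e);
}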
Use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.
The class ZipExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    }
    return new ExtractData(buf.toString().trim());
}
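To close the example, a hedged end-to-end sketch: obtaining the Extractor from an ExtractorFactory with the "application/zip" key and the local file name are assumptions for illustration, not taken from the snippet above.

final Extractor zipExtractor = extractorFactory.getExtractor("application/zip");
try (InputStream in = Files.newInputStream(Paths.get("bundle.zip"))) {
    // as in the snippet above, the top-level params map is not consulted, so an empty map is enough
    final ExtractData data = zipExtractor.getText(in, Collections.emptyMap());
    System.out.println(data.getContent());
} catch (final ExtractException e) {
    // thrown only when none of the archive entries produced any text
    logger.warn("Zip extraction failed.", e);
} catch (final IOException e) {
    logger.warn("Could not read the zip file.", e);
}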