Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in the fess-crawler project by CodeLibs: class JodExtractor, method getText.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("in is null.");
    }
    // Resource name supplied by the caller (if any); it drives the temp-file
    // naming and the extension used to pick the conversion output format.
    final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
    String extension;
    String filePrefix;
    if (StringUtil.isNotBlank(resourceName)) {
        // Split "base.name.ext" into prefix "base.name" and extension "ext".
        // A name without a dot keeps the whole name as prefix and an empty extension.
        final String name = getFileName(resourceName);
        final String[] strings = name.split("\\.");
        final StringBuilder buf = new StringBuilder(100);
        if (strings.length > 1) {
            for (int i = 0; i < strings.length - 1; i++) {
                if (buf.length() != 0) {
                    buf.append('.');
                }
                buf.append(strings[i]);
            }
            filePrefix = buf.toString();
            extension = strings[strings.length - 1];
        } else {
            filePrefix = name;
            extension = "";
        }
    } else {
        filePrefix = "none";
        extension = "";
    }
    File inputFile = null;
    File outputFile = null;
    try {
        inputFile = File.createTempFile("jodextin_" + filePrefix + "_",
                StringUtil.isNotBlank(extension) ? "." + extension : extension, tempDir);
        final String outExt = getOutputExtension(extension);
        // Prefix renamed from "cmdextout_" (a copy-paste of CommandExtractor's
        // prefix) so orphaned temp files can be traced back to this extractor.
        outputFile = File.createTempFile("jodextout_" + filePrefix + "_", "." + outExt, tempDir);
        // Spool the stream to disk: the office converter works on files only.
        CopyUtil.copy(in, inputFile);
        final OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager);
        converter.convert(inputFile, outputFile);
        final ExtractData extractData = new ExtractData(getOutputContent(outputFile, outExt));
        if (StringUtil.isNotBlank(resourceName)) {
            extractData.putValues("resourceName", new String[] { resourceName });
        }
        return extractData;
    } catch (final IOException e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Log delete failures at WARN (matching LhaExtractor and SmbClient)
        // instead of INFO so leaked temp files are visible in production logs.
        if (inputFile != null && !inputFile.delete()) {
            logger.warn("Failed to delete " + inputFile.getAbsolutePath());
        }
        if (outputFile != null && !outputFile.delete()) {
            logger.warn("Failed to delete " + outputFile.getAbsolutePath());
        }
    }
}
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in the fess-crawler project by CodeLibs: class LhaExtractor, method getText.
/**
 * Extracts text from an LHA/LZH archive by spooling it to a temporary file,
 * then running the matching {@link Extractor} over each entry and joining the
 * results with newlines. Entries whose total uncompressed size exceeds
 * {@code maxContentSize} abort the extraction.
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder content = new StringBuilder(1000);
    File archiveFile = null;
    LhaFile archive = null;
    try {
        // The LHA library needs random access, so copy the stream to disk first.
        archiveFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream out = new FileOutputStream(archiveFile)) {
            CopyUtil.copy(in, out);
        }
        archive = new LhaFile(archiveFile);
        @SuppressWarnings("unchecked")
        final Enumeration<LhaHeader> headers = archive.entries();
        long extractedSize = 0;
        while (headers.hasMoreElements()) {
            final LhaHeader header = headers.nextElement();
            // Enforce the cumulative uncompressed-size limit before reading the entry.
            extractedSize += header.getOriginalSize();
            if (maxContentSize != -1 && extractedSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + extractedSize + " > " + maxContentSize);
            }
            final String entryName = header.getPath();
            final String mimeType = mimeTypeHelper.getContentType(null, entryName);
            if (mimeType == null) {
                continue;
            }
            final Extractor extractor = extractorFactory.getExtractor(mimeType);
            if (extractor == null) {
                continue;
            }
            InputStream entryStream = null;
            try {
                entryStream = archive.getInputStream(header);
                final Map<String, String> entryParams = new HashMap<>();
                entryParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, entryName);
                content.append(extractor.getText(new IgnoreCloseInputStream(entryStream), entryParams).getContent());
                content.append('\n');
            } catch (final Exception e) {
                // A broken entry must not abort the whole archive; skip it.
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            } finally {
                CloseableUtil.closeQuietly(entryStream);
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Close the archive before deleting its backing file.
        if (archive != null) {
            try {
                archive.close();
            } catch (final IOException e) {
                // ignore
            }
        }
        if (archiveFile != null && !archiveFile.delete()) {
            logger.warn("Failed to delete " + archiveFile.getAbsolutePath());
        }
    }
    return new ExtractData(content.toString().trim());
}
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in the fess-crawler project by CodeLibs: class SmbClient, method getResponseData.
/**
 * Builds a {@link ResponseData} for the given SMB url.
 * <p>
 * Regular files become an OK response whose body is cached in memory or in a
 * temp file depending on size; directories are reported by throwing
 * {@link ChildUrlsException} carrying the child urls; anything else (or an
 * unparsable url) becomes a NOT_FOUND response. The returned ResponseData owns
 * the response body and must be closed by the caller.
 *
 * @param uri the SMB url to access
 * @param includeContent whether to read file content / list directory children
 * @return the populated response data (never null)
 * @throws ChildUrlsException when the target is a directory (carries child urls)
 * @throws CrawlingAccessException when the SMB access itself fails
 * @throws MaxLengthExceededException when the content exceeds the configured limit
 */
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    responseData.setMethod(Constants.GET_METHOD);
    final String filePath = preprocessUri(uri);
    responseData.setUrl(filePath);
    SmbFile file = null;
    // Credentials registered for this path; null means unauthenticated access.
    final SmbAuthentication smbAuthentication = smbAuthenticationHolder.get(filePath);
    if (logger.isDebugEnabled()) {
    logger.debug("Creating SmbFile: " + filePath);
    }
    try {
    if (smbAuthentication == null) {
    file = new SmbFile(filePath);
    } else {
    file = new SmbFile(filePath, smbAuthentication.getAuthentication());
    }
    } catch (final MalformedURLException e) {
    // An unparsable url is not fatal; file stays null and is reported as NOT_FOUND below.
    logger.warn("Could not parse url: " + filePath, e);
    }
    if (logger.isDebugEnabled()) {
    logger.debug("Processing SmbFile: " + filePath);
    }
    try {
    if (file == null) {
    responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
    responseData.setCharSet(charset);
    responseData.setContentLength(0);
    } else if (file.isFile()) {
    if (logger.isDebugEnabled()) {
    logger.debug("Checking SmbFile Size: " + filePath);
    }
    // Length check happens before any content is read.
    responseData.setContentLength(file.length());
    checkMaxContentLength(responseData);
    responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
    // NOTE(review): "geCharSet" looks like a typo for "getCharSet" — defined elsewhere in this class.
    responseData.setCharSet(geCharSet(file));
    responseData.setLastModified(new Date(file.lastModified()));
    responseData.addMetaData(SMB_CREATE_TIME, new Date(file.createTime()));
    try {
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile Owner: " + filePath);
    }
    final SID ownerUser = file.getOwnerUser();
    if (ownerUser != null) {
    final String[] ownerAttributes = { ownerUser.getAccountName(), ownerUser.getDomainName() };
    responseData.addMetaData(SMB_OWNER_ATTRIBUTES, ownerAttributes);
    }
    } catch (final IOException e) {
    // Owner lookup is best-effort; crawling continues without owner metadata.
    logger.warn("Cannot get owner of the file: " + filePath);
    }
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile ACL: " + filePath);
    }
    processAccessControlEntries(responseData, file);
    // Copy SMB header fields into response metadata.
    final Map<String, List<String>> headerFieldMap = file.getHeaderFields();
    if (headerFieldMap != null) {
    for (final Map.Entry<String, List<String>> entry : headerFieldMap.entrySet()) {
    responseData.addMetaData(entry.getKey(), entry.getValue());
    }
    }
    if (file.canRead()) {
    final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
    if (includeContent) {
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile Content: " + filePath);
    }
    // Small files are buffered in memory; larger ones are spooled to a
    // temp file that ResponseData takes ownership of (second arg "true").
    if (file.getContentLength() < maxCachedContentSize) {
    try (InputStream contentStream = new BufferedInputStream(new SmbFileInputStream(file))) {
    responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
    } catch (final Exception e) {
    logger.warn("I/O Exception.", e);
    responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
    }
    } else {
    File outputFile = null;
    try {
    outputFile = File.createTempFile("crawler-SmbClient-", ".out");
    copy(file, outputFile);
    responseData.setResponseBody(outputFile, true);
    } catch (final Exception e) {
    logger.warn("I/O Exception.", e);
    responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
    // On failure the temp file was never handed to responseData, so clean it up here.
    if (outputFile != null && !outputFile.delete()) {
    logger.warn("Could not delete " + outputFile.getAbsolutePath());
    }
    }
    }
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile MIME Type: " + filePath);
    }
    // Detect MIME type from the already-fetched body; fall back to name-only detection.
    try (final InputStream is = responseData.getResponseBody()) {
    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
    } catch (final Exception e) {
    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
    }
    } else {
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile MIME Type: " + filePath);
    }
    // Content not requested: sniff the MIME type directly from the SMB stream.
    try (final InputStream is = new SmbFileInputStream(file)) {
    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
    } catch (final Exception e) {
    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
    }
    }
    // Per-MIME-type length limit, applied after detection.
    if (contentLengthHelper != null) {
    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
    if (responseData.getContentLength() > maxLength) {
    throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
    }
    }
    } else {
    // Forbidden
    responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
    responseData.setMimeType(APPLICATION_OCTET_STREAM);
    }
    } else if (file.isDirectory()) {
    if (logger.isDebugEnabled()) {
    logger.debug("Parsing SmbFile Directory: " + filePath);
    }
    // Directories are reported via ChildUrlsException so the crawler queues the children.
    final Set<RequestData> requestDataSet = new HashSet<>(100);
    if (includeContent) {
    final SmbFile[] files = file.listFiles();
    if (files != null) {
    for (final SmbFile f : files) {
    // NOTE(review): "chileUri" looks like a typo for "childUri" (local variable only).
    final String chileUri = f.toString();
    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(chileUri).build());
    }
    }
    }
    throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
    responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
    responseData.setCharSet(charset);
    responseData.setContentLength(0);
    }
    } catch (final CrawlerSystemException e) {
    // responseData may own a temp-file body; release it before propagating.
    CloseableUtil.closeQuietly(responseData);
    throw e;
    } catch (final SmbException e) {
    CloseableUtil.closeQuietly(responseData);
    throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in the fess-crawler project by CodeLibs: class XmlUtil, method getDataMap.
/**
 * Parses the XML payload stored in the given access result back into a data map.
 *
 * @param accessResultData the stored result whose {@code getData()} bytes hold the XML
 * @return the map reconstructed by {@link DocHandler}
 * @throws CrawlerSystemException if the XML cannot be parsed
 */
public static Map<String, Object> getDataMap(final AccessResultData<?> accessResultData) {
    // create input source
    final InputSource is = new InputSource(new ByteArrayInputStream(accessResultData.getData()));
    if (StringUtil.isNotBlank(accessResultData.getEncoding())) {
        is.setEncoding(accessResultData.getEncoding());
    }
    // create handler
    final DocHandler handler = new DocHandler();
    // create a sax instance
    final SAXParserFactory spfactory = SAXParserFactory.newInstance();
    try {
        // Harden the parser against XXE: the stored XML may embed external
        // entity references; resolving them could read local files or reach
        // out over the network. Disable external entity resolution entirely.
        spfactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        spfactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        // create a sax parser
        final SAXParser parser = spfactory.newSAXParser();
        // parse a content
        parser.parse(is, handler);
        return handler.getDataMap();
    } catch (final Exception e) {
        throw new CrawlerSystemException("Could not create a data map from XML content.", e);
    }
}
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in the fess-crawler project by CodeLibs: class BinaryTransformer, method transform.
/**
 * Copies the raw response body into a {@link ResultData} unchanged — a
 * pass-through transformer for binary content.
 *
 * @param responseData the response to transform; must have a body
 * @return the result holding the body bytes and the response charset
 * @throws CrawlingAccessException if there is no response body
 * @throws CrawlerSystemException if reading the body fails
 *
 * @see org.codelibs.fess.crawler.transformer.Transformer#transform(org.codelibs.fess.crawler.entity.ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ResultData result = new ResultData();
    result.setTransformerName(getName());
    try (BufferedInputStream bodyStream = new BufferedInputStream(responseData.getResponseBody())) {
        result.setData(IOUtils.toByteArray(bodyStream));
        result.setEncoding(responseData.getCharSet());
        return result;
    } catch (final IOException e) {
        throw new CrawlerSystemException("Could not convert the input stream.", e);
    }
}
Aggregations