Usage of org.codelibs.fess.crawler.helper.MimeTypeHelper in the fess-crawler project by CodeLibs.
Example: the getResponseData method of the FileSystemClient class.
/**
 * Builds a {@link ResponseData} for a {@code file:} URI.
 * <p>
 * Regular files yield a 200 response with owner/group metadata, MIME type and
 * (optionally) the content; directories raise a {@link ChildUrlsException}
 * carrying the child URLs; unreadable files yield 403 and anything else 404.
 *
 * @param uri            the crawled URI (passed through {@code preprocessUri})
 * @param includeContent whether to load the file body / enumerate directory children
 * @return the populated response data (closed and re-thrown on failure)
 * @throws ChildUrlsException      when {@code uri} points at a directory
 * @throws CrawlingAccessException when the file cannot be accessed
 */
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);

        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            logger.warn("Could not parse url: " + filePath, e);
        }

        if (file == null) {
            // Unparseable URI: treat as not found rather than failing the crawl.
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // Record the size up-front so the global length check can reject early.
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);

            // Owner metadata (best effort — attribute views are platform dependent).
            try {
                final FileOwnerAttributeView ownerAttrView = Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            // ACL-based group metadata (e.g. Windows / NFSv4).
            try {
                final AclFileAttributeView aclView = Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    responseData.addMetaData(FS_FILE_GROUPS,
                            aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(n -> new String[n]));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            // POSIX group metadata.
            // NOTE(review): on systems exposing both views, this overwrites the
            // ACL-derived FILE_ATTRIBUTE_VIEW/FS_FILE_GROUPS values — confirm intended.
            try {
                final PosixFileAttributeView posixView = Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS, new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }

            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));

            if (file.canRead()) {
                // Detect MIME type from content when readable; fall back to name-only detection.
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }

                // Per-MIME-type length limit (may be stricter than the global one).
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }

                if (includeContent) {
                    // Small files are cached in memory; large ones are referenced on disk.
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // Not readable by this process: report as forbidden.
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            // Directories are not crawled directly; surface children as new requests.
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String childUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            // Neither a regular file nor a directory (e.g. vanished between checks).
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Usage of org.codelibs.fess.crawler.helper.MimeTypeHelper in the fess-crawler project by CodeLibs.
Example: the appendAttachment method of the EmlExtractor class.
/**
 * Extracts the text of a mail attachment and appends it to the given buffer.
 * <p>
 * The attachment's MIME type is resolved from its (decoded) file name; when no
 * type or no matching extractor is available the part is skipped silently.
 * Extraction and messaging failures are debug-logged and otherwise ignored.
 *
 * @param buf      destination buffer; extracted text is appended followed by a space
 * @param bodyPart the mail body part to extract from
 */
protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) {
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    try {
        final String attachmentName = getDecodeText(bodyPart.getFileName());
        final String mimeType = mimeTypeHelper.getContentType(null, attachmentName);
        if (mimeType == null) {
            return; // no type resolvable from the file name
        }
        final Extractor extractor = extractorFactory.getExtractor(mimeType);
        if (extractor == null) {
            return; // no extractor registered for this type
        }
        try (final InputStream attachmentStream = bodyPart.getInputStream()) {
            final Map<String, String> extractParams = new HashMap<>();
            extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, attachmentName);
            buf.append(extractor.getText(attachmentStream, extractParams).getContent()).append(' ');
        } catch (final Exception e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Exception in an internal extractor.", e);
            }
        }
    } catch (MessagingException e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Exception in parsing BodyPart.", e);
        }
    }
}
Usage of org.codelibs.fess.crawler.helper.MimeTypeHelper in the fess-crawler project by CodeLibs.
Example: the getText method of the ZipExtractor class.
/**
 * Extracts and concatenates the text of every entry in a ZIP archive.
 * <p>
 * Each entry whose MIME type (resolved from the entry name) has a registered
 * extractor contributes one newline-terminated chunk. The running total of
 * entry sizes is bounded by {@code maxContentSize} (-1 disables the limit).
 * Per-entry extraction failures are debug-logged; an archive-level failure is
 * fatal only when nothing was extracted at all.
 *
 * @param in     the ZIP stream to read (must not be null)
 * @param params extraction parameters (unused here)
 * @return the trimmed, concatenated text of all extractable entries
 * @throws CrawlerSystemException     when {@code in} is null
 * @throws MaxLengthExceededException when the cumulative entry size exceeds the limit
 * @throws ExtractException           when extraction fails before any text was produced
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder content = new StringBuilder(1000);
    // The archive factory needs mark support to sniff the format.
    final InputStream source = in.markSupported() ? in : new BufferedInputStream(in);
    try (final ArchiveInputStream archiveIn = archiveStreamFactory.createArchiveInputStream(source)) {
        long totalSize = 0;
        ZipArchiveEntry entry;
        while ((entry = (ZipArchiveEntry) archiveIn.getNextEntry()) != null) {
            totalSize += entry.getSize();
            if (maxContentSize != -1 && totalSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + totalSize + " > " + maxContentSize);
            }
            final String entryName = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, entryName);
            if (mimeType == null) {
                continue; // type not resolvable from the entry name
            }
            final Extractor extractor = extractorFactory.getExtractor(mimeType);
            if (extractor == null) {
                continue; // no extractor registered for this type
            }
            try {
                final Map<String, String> entryParams = new HashMap<>();
                entryParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, entryName);
                // Wrap the archive stream so the delegate extractor cannot close it.
                content.append(extractor.getText(new IgnoreCloseInputStream(archiveIn), entryParams).getContent());
                content.append('\n');
            } catch (final Exception e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        // Fatal only when no entry produced any text.
        if (content.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    }
    return new ExtractData(content.toString().trim());
}
Usage of org.codelibs.fess.crawler.helper.MimeTypeHelper in the fess-crawler project by CodeLibs.
Example: the updateResponseData method of the FtpClient class.
/**
 * Populates {@code responseData} for an FTP entry and returns the client to the pool.
 * <p>
 * Handles four cases: missing entry (404), symbolic link (redirect to the
 * resolved target), regular file (download via a temp file, with MIME/length
 * checks), and directory (raises {@link ChildUrlsException} with the children).
 * Every exit path re-offers {@code client} to {@code ftpClientQueue} except the
 * I/O-failure paths, which disconnect it instead.
 *
 * @param uri            the crawled URI (used for messages and redirect comparison)
 * @param includeContent whether to download file content / list directory children
 * @param responseData   the response object to populate
 * @param client         the pooled FTP client in use
 * @param ftpInfo        parsed FTP location info for {@code uri}
 * @param file           the listed FTP entry, or {@code null} when absent
 * @throws ChildUrlsException      when the entry is a directory
 * @throws CrawlingAccessException when retrieval or listing fails
 */
protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData,
        FTPClient client, final FtpInfo ftpInfo, FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }

    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            // Link target unknown: cannot follow.
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            // Absolute path on the server.
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            // Relative path climbing out of the current directory.
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            // Plain relative path: resolve against the parent directory.
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            // FIX: was Constants.OK_STATUS — inconsistent with every other OK path
            // (see the file/redirect handling elsewhere, which uses OK_STATUS_CODE).
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
        // Self-referential link: falls through and is treated as a directory below.
    }

    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // Record the size up-front so the global length check can reject early.
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                // Download into a temp file first so we can sniff MIME type and size.
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }

                // Detect MIME type from content; fall back to name-only detection.
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }

                // Per-MIME-type length limit (may be stricter than the global one).
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }

                responseData.setCharSet(geCharSet(tempFile));
                // Small files are cached in memory; large ones move to a managed temp file.
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                // Connection state is suspect after an I/O failure: drop the client.
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        // Directories (and self-referential links) surface children as new requests.
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        // Unknown entry type.
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
Usage of org.codelibs.fess.crawler.helper.MimeTypeHelper in the fess-crawler project by CodeLibs.
Example: the getText method of the TarExtractor class.
/**
 * Extracts the concatenated text of a TAR archive's entries.
 * <p>
 * Validates the input stream, then delegates the per-entry work to
 * {@code getTextInternal} with the shared MIME-type helper and extractor factory.
 *
 * @param in     the TAR stream to read (must not be null)
 * @param params extraction parameters (unused here)
 * @return the extracted text wrapped in an {@link ExtractData}
 * @throws CrawlerSystemException when {@code in} is null
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    return new ExtractData(getTextInternal(in, getMimeTypeHelper(), getExtractorFactory()));
}
Aggregations