Search in sources :

Example 1 with MimeTypeHelper

use of org.codelibs.fess.crawler.helper.MimeTypeHelper in project fess-crawler by codelibs.

The following is from the class FileSystemClient, method getResponseData.

/**
 * Creates a {@link ResponseData} for the given file URI.
 * <p>
 * Regular files yield status 200 with owner/group metadata, MIME type and
 * (optionally) the content; directories raise a {@link ChildUrlsException}
 * carrying the child URLs; unparsable URIs and other entries are reported
 * as 404.
 *
 * @param uri the file URI to access
 * @param includeContent whether to load the file content (for a file) or
 *            enumerate children (for a directory)
 * @return the populated response data
 * @throws ChildUrlsException if the URI points to a directory
 * @throws CrawlingAccessException if the file cannot be accessed
 */
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);
        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            // Parameterized logging avoids eager string concatenation.
            logger.warn("Could not parse url: {}", filePath, e);
        }
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // check file size before doing any further work
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            // Record the owning user, if the file system exposes one.
            try {
                final FileOwnerAttributeView ownerAttrView = Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            // Record ACL-based group principals, where the file system has an ACL view.
            try {
                final AclFileAttributeView aclView = Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    responseData.addMetaData(FS_FILE_GROUPS, aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(String[]::new));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            // Record the POSIX group. NOTE(review): when both the ACL and POSIX
            // views are available this re-adds FILE_ATTRIBUTE_VIEW/FS_FILE_GROUPS
            // after the ACL block above — confirm addMetaData's overwrite
            // semantics make that intentional.
            try {
                final PosixFileAttributeView posixView = Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS, new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                // Sniff the MIME type from the content; fall back to a
                // filename-only guess if the stream cannot be read.
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                // Re-check the length limit now that the MIME type is known,
                // since limits may be type-specific.
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
                if (includeContent) {
                    // Small files are buffered in memory; larger ones are
                    // attached as a file reference.
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            // Directories are not fetched directly; surface the children as
            // new crawl requests instead.
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String childUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : FileOwnerAttributeView(java.nio.file.attribute.FileOwnerAttributeView) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) BufferedInputStream(java.io.BufferedInputStream) Date(java.util.Date) URISyntaxException(java.net.URISyntaxException) PosixFileAttributeView(java.nio.file.attribute.PosixFileAttributeView) LoggerFactory(org.slf4j.LoggerFactory) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) AbstractCrawlerClient(org.codelibs.fess.crawler.client.AbstractCrawlerClient) HashSet(java.util.HashSet) UserPrincipal(java.nio.file.attribute.UserPrincipal) URI(java.net.URI) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) AclFileAttributeView(java.nio.file.attribute.AclFileAttributeView) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) FileInputStream(java.io.FileInputStream) FileOwnerAttributeView(java.nio.file.attribute.FileOwnerAttributeView) CrawlerContainer(org.codelibs.fess.crawler.container.CrawlerContainer) File(java.io.File) CloseableUtil(org.codelibs.core.io.CloseableUtil) Constants(org.codelibs.fess.crawler.Constants) URLEncoder(java.net.URLEncoder) RequestData(org.codelibs.fess.crawler.entity.RequestData) AccessTimeoutTarget(org.codelibs.fess.crawler.client.AccessTimeoutTarget) TimeoutManager(org.codelibs.core.timer.TimeoutManager) TimeoutTask(org.codelibs.core.timer.TimeoutTask) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) 
InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) AclFileAttributeView(java.nio.file.attribute.AclFileAttributeView) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) UserPrincipal(java.nio.file.attribute.UserPrincipal) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) URISyntaxException(java.net.URISyntaxException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Date(java.util.Date) FileInputStream(java.io.FileInputStream) PosixFileAttributeView(java.nio.file.attribute.PosixFileAttributeView) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) File(java.io.File) HashSet(java.util.HashSet)

Example 2 with MimeTypeHelper

use of org.codelibs.fess.crawler.helper.MimeTypeHelper in project fess-crawler by codelibs.

The following is from the class EmlExtractor, method appendAttachment.

/**
 * Extracts the textual content of a mail attachment and appends it to the
 * given buffer, followed by a single space. Attachments whose MIME type
 * cannot be resolved, or for which no extractor is registered, are skipped
 * silently; extraction and parsing failures are logged at debug level.
 *
 * @param buf the buffer receiving the extracted text
 * @param bodyPart the attachment to process
 */
protected void appendAttachment(final StringBuilder buf, final BodyPart bodyPart) {
    final MimeTypeHelper typeHelper = getMimeTypeHelper();
    final ExtractorFactory factory = getExtractorFactory();
    try {
        final String attachmentName = getDecodeText(bodyPart.getFileName());
        final String contentType = typeHelper.getContentType(null, attachmentName);
        if (contentType == null) {
            // Unknown type: nothing we can extract.
            return;
        }
        final Extractor textExtractor = factory.getExtractor(contentType);
        if (textExtractor == null) {
            // No extractor registered for this type.
            return;
        }
        try (final InputStream in = bodyPart.getInputStream()) {
            final Map<String, String> extractParams = new HashMap<>();
            extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, attachmentName);
            buf.append(textExtractor.getText(in, extractParams).getContent()).append(' ');
        } catch (final Exception e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Exception in an internal extractor.", e);
            }
        }
    } catch (final MessagingException e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Exception in parsing BodyPart.", e);
        }
    }
}
Also used : HashMap(java.util.HashMap) MessagingException(javax.mail.MessagingException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) InputStream(java.io.InputStream) Extractor(org.codelibs.fess.crawler.extractor.Extractor) MessagingException(javax.mail.MessagingException) ParseException(java.text.ParseException) IOException(java.io.IOException) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Example 3 with MimeTypeHelper

use of org.codelibs.fess.crawler.helper.MimeTypeHelper in project fess-crawler by codelibs.

The following is from the class ZipExtractor, method getText.

/**
 * Extracts text from every recognizable entry of a zip archive, joining
 * the per-entry contents with newlines. Entries with an unknown MIME type
 * or no registered extractor are skipped; a cumulative size limit
 * ({@code maxContentSize}, -1 meaning unlimited) is enforced across entries.
 *
 * @param in the zip archive stream; must not be null
 * @param params extraction parameters (currently unused here)
 * @return the concatenated, trimmed text of all extracted entries
 * @throws CrawlerSystemException if {@code in} is null
 * @throws MaxLengthExceededException if the cumulative entry size exceeds the limit
 * @throws ExtractException if the archive fails before any text was extracted
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper typeHelper = getMimeTypeHelper();
    final ExtractorFactory factory = getExtractorFactory();
    final StringBuilder content = new StringBuilder(1000);
    // The archive factory needs mark support to detect the format.
    final InputStream markable = in.markSupported() ? in : new BufferedInputStream(in);
    try (final ArchiveInputStream ais = archiveStreamFactory.createArchiveInputStream(markable)) {
        long totalSize = 0;
        ZipArchiveEntry entry;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            // NOTE(review): getSize() may return -1 for entries whose size is
            // unknown in streaming mode, which would shrink totalSize — confirm
            // whether that case needs guarding here.
            totalSize += entry.getSize();
            if (maxContentSize != -1 && totalSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + totalSize + " > " + maxContentSize);
            }
            final String entryName = entry.getName();
            final String contentType = typeHelper.getContentType(null, entryName);
            if (contentType == null) {
                continue;
            }
            final Extractor entryExtractor = factory.getExtractor(contentType);
            if (entryExtractor == null) {
                continue;
            }
            try {
                final Map<String, String> extractParams = new HashMap<>();
                extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, entryName);
                // Shield the shared archive stream from being closed by the extractor.
                content.append(entryExtractor.getText(new IgnoreCloseInputStream(ais), extractParams).getContent()).append('\n');
            } catch (final Exception e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        // Best effort: if some text was already extracted, return it.
        if (content.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    }
    return new ExtractData(content.toString().trim());
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) HashMap(java.util.HashMap) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) BufferedInputStream(java.io.BufferedInputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry) Extractor(org.codelibs.fess.crawler.extractor.Extractor) IgnoreCloseInputStream(org.codelibs.fess.crawler.util.IgnoreCloseInputStream)

Example 4 with MimeTypeHelper

use of org.codelibs.fess.crawler.helper.MimeTypeHelper in project fess-crawler by codelibs.

The following is from the class FtpClient, method updateResponseData.

/**
 * Populates {@code responseData} for the FTP entry {@code file}.
 * <p>
 * Files are fetched to a temp file, sniffed for MIME type and either
 * buffered in memory or attached as a file; directories (and symlinks to
 * them) raise a {@link ChildUrlsException} with the child URLs; missing
 * entries yield 404 and other entries 400.
 * <p>
 * Invariant: every exit path must hand {@code client} back exactly once,
 * either via {@code ftpClientQueue.offer(client)} (reuse) or
 * {@code disconnectInternalClient(client)} (discard after I/O failure).
 *
 * @param uri the original request URI (used for redirect comparison and messages)
 * @param includeContent whether to download content / enumerate children
 * @param responseData the response object to populate
 * @param client the pooled FTP client to use and then return
 * @param ftpInfo parsed location info for the target
 * @param file the listed FTP entry; may be null if not found
 */
protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData, FTPClient client, final FtpInfo ftpInfo, FTPFile file) {
    // Entry not found on the server.
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            // Symlink with no target: treat as a bad request.
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            // Absolute link: resolve against the server root.
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            // Relative link climbing out of this directory.
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            // Plain relative link: resolve against the entry's directory.
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            // NOTE(review): OK_STATUS is used here while other branches use
            // OK_STATUS_CODE — confirm this asymmetry is intentional.
            responseData.setHttpStatusCode(Constants.OK_STATUS);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
        // A self-referencing symlink falls through to the checks below.
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                // Download to a temp file first so the content can be
                // sniffed and sized before deciding how to attach it.
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                // Sniff the MIME type from content; fall back to a
                // filename-only guess if the stream cannot be read.
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                // Re-check the limit with the type-specific maximum.
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                // Small content is held in memory; larger content is copied
                // to a separate temp file owned by the response.
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                // The client is still healthy; return it before rethrowing.
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                // I/O failure: the connection state is suspect, so discard
                // the client instead of pooling it.
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        // Directories are not fetched; surface the children as new requests.
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String chileUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(chileUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        // Neither file, directory nor symlink: report as a bad request.
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) HashSet(java.util.HashSet) Set(java.util.Set) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FTPFile(org.apache.commons.net.ftp.FTPFile) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerLoginFailureException(org.codelibs.fess.crawler.exception.CrawlerLoginFailureException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) BufferedInputStream(java.io.BufferedInputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File) FTPFile(org.apache.commons.net.ftp.FTPFile) BufferedOutputStream(java.io.BufferedOutputStream)

Example 5 with MimeTypeHelper

use of org.codelibs.fess.crawler.helper.MimeTypeHelper in project fess-crawler by codelibs.

The following is from the class TarExtractor, method getText.

/**
 * Extracts the text content of a tar archive stream.
 *
 * @param in the tar archive stream; must not be null
 * @param params extraction parameters (currently unused here)
 * @return the extracted text wrapped in an {@link ExtractData}
 * @throws CrawlerSystemException if {@code in} is null
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    // Delegate the per-entry work to the shared internal routine.
    return new ExtractData(getTextInternal(in, getMimeTypeHelper(), getExtractorFactory()));
}
Also used : ExtractData(org.codelibs.fess.crawler.entity.ExtractData) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException)

Aggregations

MimeTypeHelper (org.codelibs.fess.crawler.helper.MimeTypeHelper)8 InputStream (java.io.InputStream)6 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)6 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)5 BufferedInputStream (java.io.BufferedInputStream)4 File (java.io.File)4 IOException (java.io.IOException)4 ExtractorFactory (org.codelibs.fess.crawler.extractor.ExtractorFactory)4 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 ExtractException (org.codelibs.fess.crawler.exception.ExtractException)3 Extractor (org.codelibs.fess.crawler.extractor.Extractor)3 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 MalformedURLException (java.net.MalformedURLException)2 Date (java.util.Date)2 Set (java.util.Set)2 ExtractData (org.codelibs.fess.crawler.entity.ExtractData)2 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)2