Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
In the class FessXpathTransformer, the method storeData:
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
        // Skip a leading UTF-8 BOM if present; otherwise rewind to the marked position.
        final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
        bis.mark(UTF8_BOM_SIZE);
        final int size = bis.read(bomBytes);
        if (size < UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
            bis.reset();
        }
        final InputSource is = new InputSource(bis);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    processMetaRobots(responseData, resultData, document);
    processXRobotsTag(responseData, resultData);
    final Map<String, Object> dataMap = new LinkedHashMap<>();
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            // Evaluate the XPath expression and store the result according to its type.
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            switch (type) {
            case XObject.CLASS_BOOLEAN:
                final boolean b = xObj.bool();
                putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
                break;
            case XObject.CLASS_NUMBER:
                final double d = xObj.num();
                putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
                break;
            case XObject.CLASS_STRING:
                final String str = xObj.str();
                putResultDataBody(dataMap, entry.getKey(), str);
                break;
            case XObject.CLASS_NULL:
            case XObject.CLASS_UNKNOWN:
            case XObject.CLASS_NODESET:
            case XObject.CLASS_RTREEFRAG:
            case XObject.CLASS_UNRESOLVEDVARIABLE:
            default:
                // Fall back to selecting a single node, optionally pruning it first.
                final Boolean isPruned = fieldPrunedRuleMap.get(entry.getKey());
                Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
                if (value != null && isPruned != null && isPruned.booleanValue()) {
                    value = pruneNode(value);
                }
                putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
                break;
            }
        } catch (final TransformerException e) {
            logger.warn("Could not parse a value of {}:{}", entry.getKey(), entry.getValue(), e);
        }
    }
    putAdditionalData(dataMap, responseData, document);
    normalizeData(responseData, dataMap);
    try {
        resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
    }
    resultData.setEncoding(charsetName);
}
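The mark/read/reset sequence at the top of this method is what strips a UTF-8 byte-order mark before the stream reaches the DOM parser. A minimal standalone sketch of the same technique, assuming only the JDK (the class and method names below are illustrative, not part of Fess):

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class BomSkipExample {
    private static final int UTF8_BOM_SIZE = 3;

    // Returns a stream positioned after a UTF-8 BOM if one is present,
    // otherwise at the first byte of the original content.
    static InputStream skipUtf8Bom(final InputStream in) throws IOException {
        final BufferedInputStream bis = new BufferedInputStream(in);
        bis.mark(UTF8_BOM_SIZE);
        final byte[] bom = new byte[UTF8_BOM_SIZE];
        final int size = bis.read(bom);
        final boolean isBom = size == UTF8_BOM_SIZE
                && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF;
        if (!isBom) {
            bis.reset(); // not a BOM: rewind so the parser sees every byte
        }
        return bis;
    }

    public static void main(final String[] args) throws IOException {
        final byte[] data = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'a' };
        final InputStream in = skipUtf8Bom(new ByteArrayInputStream(data));
        System.out.println((char) in.read()); // prints 'a'
    }
}

The key detail is that reset() is called only when the bytes read are not a BOM, so a BOM is consumed silently while ordinary content is replayed from the marked position.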
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
In the class DatabaseDataStoreImpl, the method storeData:
@Override
protected void storeData(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> paramMap,
        final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final long readInterval = getReadInterval(paramMap);
    Connection con = null;
    Statement stmt = null;
    ResultSet rs = null;
    try {
        Class.forName(getDriverClass(paramMap));
        final String jdbcUrl = getUrl(paramMap);
        final String username = getUsername(paramMap);
        final String password = getPassword(paramMap);
        if (StringUtil.isNotEmpty(username)) {
            con = DriverManager.getConnection(jdbcUrl, username, password);
        } else {
            con = DriverManager.getConnection(jdbcUrl);
        }
        final String sql = getSql(paramMap);
        stmt = con.createStatement();
        // SQL generated by an administrator
        rs = stmt.executeQuery(sql);
        boolean loop = true;
        while (rs.next() && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            // Evaluate each configured script against the current row.
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(),
                        new ResultSetParamMap(config, crawlingContext, rs, paramMap));
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                // Report the most specific cause; a MultipleCrawlingAccessException
                // carries the interesting failure as its last cause.
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    url = sql + ":" + rs.getRow();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = sql + ":" + rs.getRow();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data in DB.", e);
    } finally {
        // Close the result set, statement, and connection in order, logging each failure.
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (final SQLException e) {
            logger.warn("Failed to close a result set.", e);
        } finally {
            try {
                if (stmt != null) {
                    stmt.close();
                }
            } catch (final SQLException e) {
                logger.warn("Failed to close a statement.", e);
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (final SQLException e) {
                    logger.warn("Failed to close a db connection.", e);
                }
            }
        }
    }
}
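The triple-nested finally chain at the end closes the ResultSet, Statement, and Connection in order, logging each failure separately. On Java 7 and later the same lifecycle can be expressed with try-with-resources, which closes the three resources in reverse declaration order even when the loop throws. A hedged sketch under that assumption (it drops the per-resource warn logging, so it is not a drop-in replacement for the method above, and crawl is a hypothetical name):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class JdbcCrawlExample {
    // Equivalent resource handling with try-with-resources: the connection,
    // statement, and result set are closed automatically, in reverse order,
    // even if an exception is thrown mid-loop.
    static void crawl(final String jdbcUrl, final String sql) throws SQLException {
        try (Connection con = DriverManager.getConnection(jdbcUrl);
                Statement stmt = con.createStatement();
                ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                // map each row into a dataMap and pass it to the callback here
            }
        }
    }
}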
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
In the class EsDataStoreImpl, the method processData:
protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
        final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {
    final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
    final String[] indices;
    if (paramMap.containsKey(INDEX)) {
        indices = paramMap.get(INDEX).trim().split(",");
    } else {
        indices = new String[] { "_all" };
    }
    final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
    final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
    final SearchRequestBuilder builder = client.prepareSearch(indices);
    if (paramMap.containsKey(TYPE)) {
        builder.setTypes(paramMap.get(TYPE).trim().split(","));
    }
    if (paramMap.containsKey(SIZE)) {
        builder.setSize(Integer.parseInt(paramMap.get(SIZE)));
    }
    if (paramMap.containsKey(FIELDS)) {
        builder.setFetchSource(paramMap.get(FIELDS).trim().split(","), null);
    }
    builder.setQuery(QueryBuilders.wrapperQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"match_all\":{}}"));
    builder.setScroll(scroll);
    builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
    try {
        SearchResponse response = builder.execute().actionGet(timeout);
        String scrollId = response.getScrollId();
        // Walk the scroll cursor until no hits remain or the crawler is stopped.
        while (scrollId != null) {
            final SearchHits searchHits = response.getHits();
            final SearchHit[] hits = searchHits.getHits();
            if (hits.length == 0) {
                scrollId = null;
                break;
            }
            boolean loop = true;
            final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
            for (final SearchHit hit : hits) {
                if (!alive || !loop) {
                    break;
                }
                final Map<String, Object> dataMap = new HashMap<>();
                dataMap.putAll(defaultDataMap);
                final Map<String, Object> resultMap = new LinkedHashMap<>();
                resultMap.putAll(paramMap);
                resultMap.put("index", hit.getIndex());
                resultMap.put("type", hit.getType());
                resultMap.put("id", hit.getId());
                resultMap.put("version", Long.valueOf(hit.getVersion()));
                resultMap.put("hit", hit);
                resultMap.put("source", hit.getSource());
                resultMap.put("crawlingConfig", dataConfig);
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                final Map<String, Object> crawlingContext = new HashMap<>();
                crawlingContext.put("doc", dataMap);
                resultMap.put("crawlingContext", crawlingContext);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                try {
                    callback.store(paramMap, dataMap);
                } catch (final CrawlingAccessException e) {
                    logger.warn("Crawling Access Exception at : " + dataMap, e);
                    Throwable target = e;
                    if (target instanceof MultipleCrawlingAccessException) {
                        final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                        if (causes.length > 0) {
                            target = causes[causes.length - 1];
                        }
                    }
                    String errorName;
                    final Throwable cause = target.getCause();
                    if (cause != null) {
                        errorName = cause.getClass().getCanonicalName();
                    } else {
                        errorName = target.getClass().getCanonicalName();
                    }
                    String url;
                    if (target instanceof DataStoreCrawlingException) {
                        final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                        url = dce.getUrl();
                        if (dce.aborted()) {
                            loop = false;
                        }
                    } else {
                        url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    }
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, errorName, url, target);
                } catch (final Throwable t) {
                    logger.warn("Crawling Access Exception at : " + dataMap, t);
                    final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
                }
                if (bulkRequest != null) {
                    bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                }
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                if (bulkResponse.hasFailures()) {
                    logger.warn(bulkResponse.buildFailureMessage());
                }
            }
            if (!alive) {
                break;
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
            scrollId = response.getScrollId();
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when accessing elasticsearch.", e);
    }
}
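The catch block here repeats, almost verbatim, the unwrapping logic from DatabaseDataStoreImpl.storeData above: take the last cause of a MultipleCrawlingAccessException, prefer the root cause's class name as the error label, and stop the loop when a DataStoreCrawlingException reports that it was aborted. A sketch of how the shared fragment could be factored into a helper; the class CrawlingErrorUtil and both method names are hypothetical, not part of Fess:

import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;

public final class CrawlingErrorUtil {
    private CrawlingErrorUtil() {
    }

    // Hypothetical helper: resolves the most specific throwable to report.
    // A MultipleCrawlingAccessException carries the interesting failure as its last cause.
    public static Throwable unwrapCrawlingTarget(final CrawlingAccessException e) {
        Throwable target = e;
        if (target instanceof MultipleCrawlingAccessException) {
            final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
            if (causes.length > 0) {
                target = causes[causes.length - 1];
            }
        }
        return target;
    }

    // Hypothetical helper: prefers the root cause's class name as the error label.
    public static String errorNameOf(final Throwable target) {
        final Throwable cause = target.getCause();
        return cause != null ? cause.getClass().getCanonicalName() : target.getClass().getCanonicalName();
    }
}

Each data store would then keep only the URL construction and the loop flag, which are the parts that genuinely differ between the two implementations.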
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
In the class AbstractFessFileTransformer, the method generateData:
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta
        extractData.getKeySet().stream().filter(k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
            || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0
                && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
            documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
                        documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else if (StringUtil.isBlank(fileName)) {
            putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
        } else {
            putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
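The mapping branch inside the metadata lambda dispatches on a configured type tag: an array mapping keeps the raw values, a string mapping joins them with spaces, and long or double mappings parse a single value. A compact, self-contained sketch of that dispatch, using plain string tags in place of the Constants.MAPPING_TYPE_* values (applyMapping is a hypothetical name, and unlike the original this sketch does not catch NumberFormatException):

import java.util.HashMap;
import java.util.Map;

public class MetadataMappingExample {
    static void applyMapping(final Map<String, Object> dataMap, final String field,
            final String type, final String[] values) {
        if ("array".equalsIgnoreCase(type)) {
            dataMap.put(field, values); // keep all values
        } else if ("string".equalsIgnoreCase(type)) {
            dataMap.put(field, String.join(" ", values).trim()); // join into one string
        } else if (values.length == 1 && "long".equalsIgnoreCase(type)) {
            dataMap.put(field, Long.parseLong(values[0]));
        } else if (values.length == 1 && "double".equalsIgnoreCase(type)) {
            dataMap.put(field, Double.parseDouble(values[0]));
        }
    }

    public static void main(final String[] args) {
        final Map<String, Object> dataMap = new HashMap<>();
        applyMapping(dataMap, "page_count", "long", new String[] { "42" });
        System.out.println(dataMap); // {page_count=42}
    }
}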
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
In the class FtpClient, the method updateResponseData:
protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData,
        final FTPClient client, final FtpInfo ftpInfo, final FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        // Resolve the link target to an absolute URL and report it as a redirect.
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                // Download to a temp file first so the MIME type and size can be inspected.
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                if (tempFile.length() < maxCachedContentSize) {
                    // Small file: keep the body in memory.
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    // Large file: hand over a temporary file instead of a byte array.
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        // Directories (and unresolved symlinks) yield child URLs instead of content.
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
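Every successful or deliberately thrown exit path in this method first calls ftpClientQueue.offer(client) to return the pooled connection, while the unexpected-exception path calls disconnectInternalClient instead, because a connection that failed mid-transfer is no longer in a known state. A minimal sketch of that borrow/return discipline, assuming a hypothetical Client class with fetch and disconnect methods:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PooledClientExample {

    // Hypothetical stand-in for an FTP connection.
    static class Client {
        byte[] fetch(final String path) {
            return new byte[0];
        }

        void disconnect() {
        }
    }

    private final BlockingQueue<Client> clientQueue = new ArrayBlockingQueue<>(10);

    byte[] get(final Client client, final String path) {
        try {
            final byte[] data = client.fetch(path);
            clientQueue.offer(client); // healthy connection: return it to the pool
            return data;
        } catch (final RuntimeException e) {
            // A failure mid-transfer leaves the connection in an unknown state,
            // so drop it instead of returning it to the pool.
            client.disconnect();
            throw e;
        }
    }
}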