use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
the class AbstractFessFileTransformer method transform.
/**
 * Builds a {@link ResultData} whose payload is the serialized output of
 * {@code generateData(responseData)}.
 *
 * @param responseData the crawled response; must be non-null and report a response body
 * @return result data tagged with this transformer's name and the crawling data
 *         encoding read from {@code fessConfig}
 * @throws CrawlingAccessException if there is no response body, or if generating or
 *         serializing the data fails for any reason
 */
@Override
public ResultData transform(final ResponseData responseData) {
    // Equivalent to "null or no body": reject anything that is not a response with a body.
    if (!(responseData != null && responseData.hasResponseBody())) {
        throw new CrawlingAccessException("No response body.");
    }

    final ResultData transformed = new ResultData();
    transformed.setTransformerName(getName());

    try {
        transformed.setData(SerializeUtil.fromObjectToBinary(generateData(responseData)));
    } catch (final Exception cause) {
        // Any failure in generation or serialization is reported as an access failure.
        throw new CrawlingAccessException("Could not serialize object", cause);
    }

    transformed.setEncoding(fessConfig.getCrawlerCrawlingDataEncoding());
    return transformed;
}
use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
the class CsvDataStoreImpl method processCsv.
/**
 * Reads a CSV file row by row, converts each non-blank row into a document map and
 * passes it to the index update callback.
 *
 * <p>For every row a {@code resultMap} is built from {@code paramMap}, the file
 * path/name, the data config and the cell values. Cells are stored under
 * {@code CELL_PREFIX + (index + 1)} and, when a header line was read and the header
 * is not blank, additionally under the header name. Each entry of {@code scriptMap}
 * is then evaluated through {@code convertValue} against the {@code resultMap}; non-null
 * results are put into the document map, which starts as a copy of
 * {@code defaultDataMap}.</p>
 *
 * <p>Failures thrown by {@code callback.store} are logged and recorded via
 * {@link FailureUrlService}; they do not stop processing unless a
 * {@link DataStoreCrawlingException} reports {@code aborted()}.</p>
 *
 * @param dataConfig      data store configuration, exposed to scripts as {@code crawlingConfig}
 * @param callback        receiver of each converted row
 * @param paramMap        parameters copied into every row's script context
 * @param scriptMap       field name to script/template, evaluated per row
 * @param defaultDataMap  default field values copied into every document
 * @param csvConfig       CSV parser configuration
 * @param csvFile         file to read
 * @param readInterval    value passed to {@code sleep} after each stored row when greater than zero
 * @param csvFileEncoding character encoding name used to decode the file
 * @param hasHeaderLine   whether the first row holds column names
 * @throws DataStoreException if opening or reading the CSV file fails
 */
protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile, final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
    logger.info("Loading " + csvFile.getAbsolutePath());
    // The raw stream is tracked separately so it is still closed when wrapping it fails
    // (for example, an unsupported encoding name) before csvReader has been assigned.
    FileInputStream csvStream = null;
    CsvReader csvReader = null;
    try {
        csvStream = new FileInputStream(csvFile);
        csvReader = new CsvReader(new BufferedReader(new InputStreamReader(csvStream, csvFileEncoding)), csvConfig);
        List<String> headerList = null;
        if (hasHeaderLine) {
            headerList = csvReader.readValues();
        }
        List<String> list;
        boolean loop = true;
        // Check the stop flags first so that no further row is consumed once processing
        // was aborted or the data store is no longer alive.
        while (loop && alive && (list = csvReader.readValues()) != null) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            resultMap.putAll(paramMap);
            resultMap.put("csvfile", csvFile.getAbsolutePath());
            resultMap.put("csvfilename", csvFile.getName());
            resultMap.put("crawlingConfig", dataConfig);
            boolean foundValues = false;
            for (int i = 0; i < list.size(); i++) {
                String key = null;
                String value = list.get(i);
                if (value == null) {
                    value = StringUtil.EMPTY;
                }
                if (StringUtil.isNotBlank(value)) {
                    foundValues = true;
                }
                // Expose the cell under its header name when one is available...
                if (headerList != null && headerList.size() > i) {
                    key = headerList.get(i);
                    if (StringUtil.isNotBlank(key)) {
                        resultMap.put(key, value);
                    }
                }
                // ...and always under its 1-based positional key.
                key = CELL_PREFIX + Integer.toString(i + 1);
                resultMap.put(key, value);
            }
            if (!foundValues) {
                // Rows consisting only of blank cells are skipped (no store, no sleep).
                logger.debug("No data in line: {}", resultMap);
                continue;
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                    logger.debug("{}={}", entry.getKey(), entry.getValue());
                }
            }
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            resultMap.put("crawlingContext", crawlingContext);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), resultMap);
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                    logger.debug("{}={}", entry.getKey(), entry.getValue());
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                Throwable target = e;
                // For an aggregated failure, report the last collected cause.
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                // Prefer the class name of the underlying cause as the error name.
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    // No URL is available: identify the row by file path and line number.
                    url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when reading csv file.", e);
    } finally {
        IOUtils.closeQuietly(csvReader);
        // Closing the raw stream again after the reader closed it is harmless; this call
        // matters when the reader was never constructed.
        IOUtils.closeQuietly(csvStream);
    }
}
use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
the class TextTransformer method transform.
/**
 * Extracts the text content of the response body and returns it as encoded bytes.
 *
 * <p>The extractor is looked up from the {@code extractorFactory} component by the
 * response's MIME type. The extracted text is encoded with {@code charsetName}; if that
 * charset is not supported, {@code charsetName} is reset to UTF-8 and the text is
 * encoded as UTF-8 instead. Extracted content that is {@code null} is treated as an
 * empty string.</p>
 *
 * @param responseData the crawled response; must be non-null and report a response body
 * @return result data carrying the encoded text, this transformer's name and the charset used
 * @throws CrawlingAccessException if there is no response body, no extractor is available
 *         for the MIME type, or extraction fails
 * @throws CrawlerSystemException if the {@code extractorFactory} component cannot be found
 * @see org.codelibs.fess.crawler.transformer.impl.AbstractTransformer#transform(org.codelibs.fess.crawler.entity.ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }
    final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());
    if (extractor == null) {
        // Fail with an explicit message instead of a NullPointerException wrapped below.
        throw new CrawlingAccessException("Could not find an extractor for " + responseData.getMimeType());
    }
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    String content = null;
    try (final InputStream in = responseData.getResponseBody()) {
        content = extractor.getText(in, params).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }
    if (content == null) {
        // Guard against an extractor that yields no text; encode an empty payload.
        content = "";
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try {
        resultData.setData(content.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // The invalid charset is replaced on the transformer itself, so later calls use UTF-8.
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(content.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
    return resultData;
}
use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
the class XmlTransformer method transform.
/**
 * Parses the response body as XML and renders the configured fields into result data.
 *
 * <p>A {@link DocumentBuilderFactory} is configured from {@code attributeMap},
 * {@code featureMap} (a feature is enabled when its value equals "true", ignoring case)
 * and the transformer's parser flags. Each rule in {@code fieldRuleMap} is resolved via
 * {@code getNodeList}; the text content of the matched nodes is appended as a single
 * value when exactly one node matched, as a list when several matched, and skipped when
 * none matched. The output is wrapped by the result header, additional data and footer,
 * trimmed, and encoded with {@code charsetName} (falling back to UTF-8, and resetting
 * {@code charsetName}, if that charset is unsupported).</p>
 *
 * <p>NOTE(review): no parser hardening is applied here beyond what {@code featureMap}
 * and {@code attributeMap} supply; confirm the configuration disables external entities
 * when crawling untrusted XML.</p>
 *
 * @param responseData the crawled response; must be non-null and report a response body
 * @return result data with the rendered fields, this transformer's name and the charset used
 * @throws CrawlingAccessException if there is no response body
 * @throws CrawlerSystemException if parsing or rendering fails
 * @see org.codelibs.fess.crawler.transformer.impl.AbstractTransformer#transform(org.codelibs.fess.crawler.entity.ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }

    try (final InputStream bodyStream = responseData.getResponseBody()) {
        // Configure the parser factory from the transformer's settings.
        final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
        for (final Map.Entry<String, Object> attribute : attributeMap.entrySet()) {
            builderFactory.setAttribute(attribute.getKey(), attribute.getValue());
        }
        for (final Map.Entry<String, String> feature : featureMap.entrySet()) {
            final boolean enabled = "true".equalsIgnoreCase(feature.getValue());
            builderFactory.setFeature(feature.getKey(), enabled);
        }
        builderFactory.setCoalescing(coalescing);
        builderFactory.setExpandEntityReferences(expandEntityRef);
        builderFactory.setIgnoringComments(ignoringComments);
        builderFactory.setIgnoringElementContentWhitespace(ignoringElementContentWhitespace);
        builderFactory.setNamespaceAware(namespaceAware);
        builderFactory.setValidating(validating);
        builderFactory.setXIncludeAware(includeAware);

        final DocumentBuilder documentBuilder = builderFactory.newDocumentBuilder();
        final Document xmlDocument = documentBuilder.parse(bodyStream);

        final StringBuilder output = new StringBuilder(1000);
        output.append(getResultDataHeader());
        for (final Map.Entry<String, String> rule : fieldRuleMap.entrySet()) {
            final List<String> values = new ArrayList<>();
            try {
                final NodeList matches = getNodeList(xmlDocument, rule.getValue());
                final int matchCount = matches.getLength();
                for (int index = 0; index < matchCount; index++) {
                    final Node matched = matches.item(index);
                    values.add(matched.getTextContent());
                }
            } catch (final TransformerException e) {
                // A broken rule is logged; values collected before the failure are still used.
                logger.warn("Could not parse a value of " + rule.getKey() + ":" + rule.getValue(), e);
            }

            final int valueCount = values.size();
            if (valueCount > 1) {
                output.append(getResultDataBody(rule.getKey(), values));
            } else if (valueCount == 1) {
                output.append(getResultDataBody(rule.getKey(), values.get(0)));
            }
        }
        output.append(getAdditionalData(responseData, xmlDocument));
        output.append(getResultDataFooter());

        final ResultData transformed = new ResultData();
        transformed.setTransformerName(getName());
        final String rendered = output.toString().trim();
        try {
            transformed.setData(rendered.getBytes(charsetName));
        } catch (final UnsupportedEncodingException e) {
            if (logger.isInfoEnabled()) {
                logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
            }
            charsetName = Constants.UTF_8_CHARSET.name();
            transformed.setData(rendered.getBytes(Constants.UTF_8_CHARSET));
        }
        transformed.setEncoding(charsetName);
        return transformed;
    } catch (final Exception e) {
        // System exceptions pass through untouched; everything else is wrapped.
        if (e instanceof CrawlerSystemException) {
            throw (CrawlerSystemException) e;
        }
        throw new CrawlerSystemException("Could not store data.", e);
    }
}
use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
the class XpathTransformer method storeData.
/**
 * Parses the response body into a DOM document, evaluates every XPath expression in
 * {@code fieldRuleMap} against it and stores the rendered result in {@code resultData}.
 *
 * <p>The rendered value depends on the XPath result type: booleans, numbers and result
 * tree fragments are converted with their {@code toString} helpers, strings are trimmed,
 * node sets are rendered as the list of each node's text content, and any other type
 * falls back to {@code xObj.object().toString()} (empty string when the object is
 * {@code null}). An expression that fails to evaluate is logged and skipped.</p>
 *
 * <p>The output is encoded with {@code charsetName}; if that charset is unsupported,
 * {@code charsetName} is reset to UTF-8 and the data is encoded as UTF-8.</p>
 *
 * @param responseData the crawled response whose body is parsed
 * @param resultData   receives the encoded data and the charset used
 * @throws CrawlingAccessException if the response body cannot be parsed
 */
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final InputStream in = responseData.getResponseBody()) {
        final InputSource is = new InputSource(in);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    final StringBuilder buf = new StringBuilder(1000);
    buf.append(getResultDataHeader());
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            switch (type) {
            case XObject.CLASS_BOOLEAN:
                final boolean b = xObj.bool();
                buf.append(getResultDataBody(entry.getKey(), Boolean.toString(b)));
                break;
            case XObject.CLASS_NUMBER:
                final double d = xObj.num();
                buf.append(getResultDataBody(entry.getKey(), Double.toString(d)));
                break;
            case XObject.CLASS_STRING:
                final String str = xObj.str();
                buf.append(getResultDataBody(entry.getKey(), str.trim()));
                break;
            case XObject.CLASS_NODESET:
                final NodeList nodeList = xObj.nodelist();
                final List<String> strList = new ArrayList<>();
                for (int i = 0; i < nodeList.getLength(); i++) {
                    final Node node = nodeList.item(i);
                    strList.add(node.getTextContent());
                }
                buf.append(getResultDataBody(entry.getKey(), strList));
                break;
            case XObject.CLASS_RTREEFRAG:
                final int rtf = xObj.rtf();
                buf.append(getResultDataBody(entry.getKey(), Integer.toString(rtf)));
                break;
            case XObject.CLASS_NULL:
            case XObject.CLASS_UNKNOWN:
            case XObject.CLASS_UNRESOLVEDVARIABLE:
            default:
                // Any remaining type is rendered through its object form, or as empty text.
                Object obj = xObj.object();
                if (obj == null) {
                    obj = "";
                }
                buf.append(getResultDataBody(entry.getKey(), obj.toString()));
                break;
            }
        } catch (final TransformerException e) {
            // Pass the exception to the logger so the reason for the failure is not lost.
            logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
        }
    }
    buf.append(getAdditionalData(responseData, document));
    buf.append(getResultDataFooter());
    final String data = buf.toString().trim();
    try {
        resultData.setData(data.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(data.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
}
Aggregations