Search in sources:

Example 1 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in the project fess by codelibs.

The class AbstractFessFileTransformer defines the method transform.

/**
 * Serializes the extracted document data for the given response into a
 * {@link ResultData}.
 *
 * @param responseData the crawled response; must be non-null with a body
 * @return the populated result data
 * @throws CrawlingAccessException if the response has no body or the
 *         generated data cannot be serialized
 */
@Override
public ResultData transform(final ResponseData responseData) {
    // Reject missing or body-less responses up front.
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ResultData result = new ResultData();
    result.setTransformerName(getName());
    try {
        final byte[] serialized = SerializeUtil.fromObjectToBinary(generateData(responseData));
        result.setData(serialized);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not serialize object", e);
    }
    result.setEncoding(fessConfig.getCrawlerCrawlingDataEncoding());
    return result;
}
Also used : AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) ResultData(org.codelibs.fess.crawler.entity.ResultData) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException)

Example 2 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in the project fess by codelibs.

The class CsvDataStoreImpl defines the method processCsv.

/**
 * Reads the given CSV file row by row and stores each non-empty row as a
 * document via the index update callback.
 *
 * @param dataConfig the crawling configuration for this data store
 * @param callback receiver of each built document map
 * @param paramMap configuration parameters; also seeds each row's result map
 * @param scriptMap field name to script expression; each script is evaluated
 *        against the row values to produce a document field
 * @param defaultDataMap default field values copied into every document
 * @param csvConfig parser configuration for the CSV reader
 * @param csvFile the file to read
 * @param readInterval pause in milliseconds between rows (0 or less: none)
 * @param csvFileEncoding character encoding of the CSV file
 * @param hasHeaderLine whether the first line holds column names
 * @throws DataStoreException if the file cannot be read or parsed
 */
protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile, final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
    logger.info("Loading " + csvFile.getAbsolutePath());
    CsvReader csvReader = null;
    try {
        csvReader = new CsvReader(new BufferedReader(new InputStreamReader(new FileInputStream(csvFile), csvFileEncoding)), csvConfig);
        // When the file has a header line, its values become the field names
        // for the corresponding columns of every following row.
        List<String> headerList = null;
        if (hasHeaderLine) {
            headerList = csvReader.readValues();
        }
        List<String> list;
        // loop is cleared when a DataStoreCrawlingException reports abort;
        // alive is a field set elsewhere, presumably to stop the crawl
        // externally -- TODO confirm against the enclosing class.
        boolean loop = true;
        while ((list = csvReader.readValues()) != null && loop && alive) {
            // dataMap: the document handed to the callback; seeded with defaults.
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            // resultMap: raw row values plus context, used as script input.
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            resultMap.putAll(paramMap);
            resultMap.put("csvfile", csvFile.getAbsolutePath());
            resultMap.put("csvfilename", csvFile.getName());
            resultMap.put("crawlingConfig", dataConfig);
            boolean foundValues = false;
            for (int i = 0; i < list.size(); i++) {
                String key = null;
                String value = list.get(i);
                if (value == null) {
                    value = StringUtil.EMPTY;
                }
                if (StringUtil.isNotBlank(value)) {
                    foundValues = true;
                }
                // Expose the cell under its header name when one exists...
                if (headerList != null && headerList.size() > i) {
                    key = headerList.get(i);
                    if (StringUtil.isNotBlank(key)) {
                        resultMap.put(key, value);
                    }
                }
                // ...and always under a positional key (1-based column index).
                key = CELL_PREFIX + Integer.toString(i + 1);
                resultMap.put(key, value);
            }
            // Skip rows whose cells are all blank.
            if (!foundValues) {
                logger.debug("No data in line: {}", resultMap);
                continue;
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            // Scripts can reach the document being built via "crawlingContext".
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            resultMap.put("crawlingContext", crawlingContext);
            // Evaluate each configured script against the row values to build
            // the final document fields; null results are dropped.
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), resultMap);
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                // Unwrap a MultipleCrawlingAccessException to its last cause
                // so the failure record points at the real error.
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        // The data store asked to stop; end the read loop.
                        loop = false;
                    }
                } else {
                    // No URL available; identify the failure by file and line.
                    url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, url, target);
            } catch (final Throwable t) {
                // Any other failure is recorded the same way, keyed by file:line.
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
            }
            // Optional throttle between rows.
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when reading csv file.", e);
    } finally {
        IOUtils.closeQuietly(csvReader);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) FileInputStream(java.io.FileInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) LinkedHashMap(java.util.LinkedHashMap) CsvReader(com.orangesignal.csv.CsvReader) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 3 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in the project fess-crawler by codelibs.

The class TextTransformer defines the method transform.

/**
 * Extracts plain text from the response body with the extractor registered
 * for the response's MIME type and stores it in a {@link ResultData}.
 *
 * @param responseData the crawled response; must be non-null with a body
 * @return the result data holding the extracted text bytes
 * @throws CrawlingAccessException if the response has no body or extraction fails
 * @throws CrawlerSystemException if no extractor factory is registered
 */
@Override
public ResultData transform(final ResponseData responseData) {
    // Reject missing or body-less responses up front.
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }
    // Hints for the extractor: resource name and content type.
    final Map<String, String> extractParams = new HashMap<>();
    extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    extractParams.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());
    String extractedText = null;
    try (final InputStream body = responseData.getResponseBody()) {
        extractedText = extractor.getText(body, extractParams).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try {
        resultData.setData(extractedText.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        // The configured charset is invalid: fall back to UTF-8 permanently.
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(extractedText.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
    return resultData;
}
Also used : ResultData(org.codelibs.fess.crawler.entity.ResultData) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) HashMap(java.util.HashMap) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) InputStream(java.io.InputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Extractor(org.codelibs.fess.crawler.extractor.Extractor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Example 4 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in the project fess-crawler by codelibs.

The class XmlTransformer defines the method transform.

/*
 * Parses the XML response body, applies the configured field rules
 * (XPath expressions) to collect values, and serializes them into a
 * ResultData.
 *
 * NOTE(review): the DocumentBuilderFactory is configured entirely from
 * attributeMap/featureMap; whether external entities (XXE) are disabled
 * depends on that configuration -- verify the deployed settings.
 *
 * @see org.codelibs.fess.crawler.transformer.impl.AbstractTransformer#transform(org.codelibs.fess.crawler.entity.ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    // Reject missing or body-less responses up front.
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    try (final InputStream is = responseData.getResponseBody()) {
        // Build a parser from the instance's configured attributes/features.
        final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        for (final Map.Entry<String, Object> entry : attributeMap.entrySet()) {
            factory.setAttribute(entry.getKey(), entry.getValue());
        }
        for (final Map.Entry<String, String> entry : featureMap.entrySet()) {
            factory.setFeature(entry.getKey(), "true".equalsIgnoreCase(entry.getValue()));
        }
        factory.setCoalescing(coalescing);
        factory.setExpandEntityReferences(expandEntityRef);
        factory.setIgnoringComments(ignoringComments);
        factory.setIgnoringElementContentWhitespace(ignoringElementContentWhitespace);
        factory.setNamespaceAware(namespaceAware);
        factory.setValidating(validating);
        factory.setXIncludeAware(includeAware);
        final DocumentBuilder builder = factory.newDocumentBuilder();
        final Document doc = builder.parse(is);
        final StringBuilder buf = new StringBuilder(1000);
        buf.append(getResultDataHeader());
        // Evaluate each field rule (name -> XPath) against the document.
        for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
            final List<String> nodeStrList = new ArrayList<>();
            try {
                final NodeList nodeList = getNodeList(doc, entry.getValue());
                for (int i = 0; i < nodeList.getLength(); i++) {
                    final Node node = nodeList.item(i);
                    nodeStrList.add(node.getTextContent());
                }
            } catch (final TransformerException e) {
                // A bad expression only skips this field; the rest proceed.
                logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
            }
            // Single match is written as a scalar, multiple as a list.
            if (nodeStrList.size() == 1) {
                buf.append(getResultDataBody(entry.getKey(), nodeStrList.get(0)));
            } else if (nodeStrList.size() > 1) {
                buf.append(getResultDataBody(entry.getKey(), nodeStrList));
            }
        }
        buf.append(getAdditionalData(responseData, doc));
        buf.append(getResultDataFooter());
        final ResultData resultData = new ResultData();
        resultData.setTransformerName(getName());
        final String data = buf.toString().trim();
        try {
            resultData.setData(data.getBytes(charsetName));
        } catch (final UnsupportedEncodingException e) {
            // Configured charset is invalid: fall back to UTF-8 permanently.
            if (logger.isInfoEnabled()) {
                logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
            }
            charsetName = Constants.UTF_8_CHARSET.name();
            resultData.setData(data.getBytes(Constants.UTF_8_CHARSET));
        }
        resultData.setEncoding(charsetName);
        return resultData;
    } catch (final CrawlerSystemException e) {
        // Already the expected exception type; rethrow unchanged.
        throw e;
    } catch (final Exception e) {
        throw new CrawlerSystemException("Could not store data.", e);
    }
}
Also used : DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) InputStream(java.io.InputStream) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Document(org.w3c.dom.Document) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) TransformerException(javax.xml.transform.TransformerException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) ResultData(org.codelibs.fess.crawler.entity.ResultData) DocumentBuilder(javax.xml.parsers.DocumentBuilder) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TransformerException(javax.xml.transform.TransformerException)

Example 5 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in the project fess-crawler by codelibs.

The class XpathTransformer defines the method storeData.

/**
 * Parses the response body as HTML, evaluates each configured field rule
 * (XPath expression) against the document, and serializes the collected
 * values into the given {@code resultData}.
 *
 * @param responseData the crawled response whose body is parsed
 * @param resultData the result container to populate
 * @throws CrawlingAccessException if the response body cannot be parsed
 */
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final InputStream in = responseData.getResponseBody()) {
        final InputSource is = new InputSource(in);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    final StringBuilder buf = new StringBuilder(1000);
    buf.append(getResultDataHeader());
    // Evaluate each field rule (name -> XPath) and convert the XPath result
    // to text according to its runtime type.
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            switch(type) {
                case XObject.CLASS_BOOLEAN:
                    final boolean b = xObj.bool();
                    buf.append(getResultDataBody(entry.getKey(), Boolean.toString(b)));
                    break;
                case XObject.CLASS_NUMBER:
                    final double d = xObj.num();
                    buf.append(getResultDataBody(entry.getKey(), Double.toString(d)));
                    break;
                case XObject.CLASS_STRING:
                    final String str = xObj.str();
                    buf.append(getResultDataBody(entry.getKey(), str.trim()));
                    break;
                case XObject.CLASS_NODESET:
                    // A node set is flattened to the text content of each node.
                    final NodeList nodeList = xObj.nodelist();
                    final List<String> strList = new ArrayList<>();
                    for (int i = 0; i < nodeList.getLength(); i++) {
                        final Node node = nodeList.item(i);
                        strList.add(node.getTextContent());
                    }
                    buf.append(getResultDataBody(entry.getKey(), strList));
                    break;
                case XObject.CLASS_RTREEFRAG:
                    final int rtf = xObj.rtf();
                    buf.append(getResultDataBody(entry.getKey(), Integer.toString(rtf)));
                    break;
                case XObject.CLASS_NULL:
                case XObject.CLASS_UNKNOWN:
                case XObject.CLASS_UNRESOLVEDVARIABLE:
                default:
                    Object obj = xObj.object();
                    if (obj == null) {
                        obj = "";
                    }
                    buf.append(getResultDataBody(entry.getKey(), obj.toString()));
                    break;
            }
        } catch (final TransformerException e) {
            // Pass the exception to the logger so the stack trace is retained
            // (previously only the message was logged), matching the style
            // used by the sibling XML transformer.
            logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
        }
    }
    buf.append(getAdditionalData(responseData, document));
    buf.append(getResultDataFooter());
    final String data = buf.toString().trim();
    try {
        resultData.setData(data.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        // Configured charset is invalid: fall back to UTF-8 permanently.
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(data.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
}
Also used : InputSource(org.xml.sax.InputSource) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) InputStream(java.io.InputStream) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Document(org.w3c.dom.Document) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) TransformerException(javax.xml.transform.TransformerException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) XObject(org.apache.xpath.objects.XObject) DOMParser(org.cyberneko.html.parsers.DOMParser) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) XObject(org.apache.xpath.objects.XObject) TransformerException(javax.xml.transform.TransformerException)

Aggregations

CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)36 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)14 InputStream (java.io.InputStream)13 Map (java.util.Map)9 IOException (java.io.IOException)8 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)8 BufferedInputStream (java.io.BufferedInputStream)7 HashMap (java.util.HashMap)7 HashSet (java.util.HashSet)7 ResultData (org.codelibs.fess.crawler.entity.ResultData)7 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)7 MalformedURLException (java.net.MalformedURLException)6 AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData)6 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 File (java.io.File)5 LinkedHashMap (java.util.LinkedHashMap)5 FileInputStream (java.io.FileInputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 Date (java.util.Date)4