Search in sources :

Example 26 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class HtmlTransformer, method transform.

/**
 * Builds a {@link ResultData} from the given response.
 * <p>
 * The charset is updated first, then the data is stored and, when {@code isHtml} holds and
 * the response is not flagged no-follow, the child URLs are stored as well.
 * {@code xpathAPI.remove()} runs whether or not storing succeeds. If the response meta data
 * holds a String under {@code LOCATION_HEADER}, it is converted by the "urlConvertHelper"
 * component and added to the result as a further request.
 *
 * @param responseData the response to transform
 * @return the populated result data
 * @throws CrawlingAccessException if {@code responseData} is null or has no response body
 */
@Override
public ResultData transform(final ResponseData responseData) {
    final boolean bodyAvailable = responseData != null && responseData.hasResponseBody();
    if (!bodyAvailable) {
        throw new CrawlingAccessException("No response body.");
    }

    // encoding is updated before anything is stored
    updateCharset(responseData);

    final ResultData transformed = new ResultData();
    transformed.setTransformerName(getName());
    try {
        storeData(responseData, transformed);
        if (isHtml(responseData)) {
            if (!responseData.isNoFollow()) {
                storeChildUrls(responseData, transformed);
            }
        }
    } finally {
        xpathAPI.remove();
    }

    final Object location = responseData.getMetaDataMap().get(LOCATION_HEADER);
    if (!(location instanceof String)) {
        // no usable redirect target in the meta data
        return transformed;
    }
    final UrlConvertHelper converter = crawlerContainer.getComponent("urlConvertHelper");
    transformed.addUrl(RequestDataBuilder.newRequestData().get().url(converter.convert((String) location)).build());
    return transformed;
}
Also used : AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) ResultData(org.codelibs.fess.crawler.entity.ResultData) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) UrlConvertHelper(org.codelibs.fess.crawler.helper.UrlConvertHelper)

Example 27 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class SmbClientTest, method test_doGet_accessTimeoutTarget.

/**
 * Expects {@code doGet} on an {@link SmbClient} whose {@code getResponseData} sleeps for
 * 10000 ms to fail with a {@link CrawlingAccessException} caused by an
 * {@link InterruptedException} (presumably triggered by the access timeout set to 1).
 */
public void test_doGet_accessTimeoutTarget() {
    final SmbClient slowClient = new SmbClient() {

        @Override
        protected ResponseData getResponseData(final String uri, final boolean includeContent) {
            try {
                Thread.sleep(10000);
                return null;
            } catch (InterruptedException interrupted) {
                // surface the interruption to the caller, keeping it as the cause
                throw new CrawlingAccessException(interrupted);
            }
        }
    };
    slowClient.setAccessTimeout(1);

    CrawlingAccessException caught = null;
    try {
        slowClient.doGet("smb://localhost/test.txt");
    } catch (CrawlingAccessException e) {
        caught = e;
    }
    if (caught == null) {
        // doGet returned normally, which this test does not accept
        fail();
    }
    assertTrue(caught.getCause() instanceof InterruptedException);
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException)

Example 28 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class FileSystemClientTest, method test_doGet_accessTimeoutTarget.

/**
 * Expects {@code doGet} on a {@link FileSystemClient} whose {@code getResponseData} sleeps
 * for 10000 ms to fail with a {@link CrawlingAccessException} caused by an
 * {@link InterruptedException} (presumably triggered by the access timeout set to 1).
 */
public void test_doGet_accessTimeoutTarget() {
    final FileSystemClient slowClient = new FileSystemClient() {

        @Override
        protected ResponseData getResponseData(final String uri, final boolean includeContent) {
            try {
                Thread.sleep(10000);
                return null;
            } catch (InterruptedException interrupted) {
                // surface the interruption to the caller, keeping it as the cause
                throw new CrawlingAccessException(interrupted);
            }
        }
    };
    slowClient.setAccessTimeout(1);

    CrawlingAccessException caught = null;
    try {
        slowClient.doGet("file:/tmp/test.txt");
    } catch (CrawlingAccessException e) {
        caught = e;
    }
    if (caught == null) {
        // doGet returned normally, which this test does not accept
        fail();
    }
    assertTrue(caught.getCause() instanceof InterruptedException);
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException)

Example 29 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class FileSystemClientTest, method test_doHead_accessTimeoutTarget.

/**
 * Expects {@code doHead} on a {@link FileSystemClient} whose {@code getResponseData} sleeps
 * for 10000 ms to fail with a {@link CrawlingAccessException} caused by an
 * {@link InterruptedException} (presumably triggered by the access timeout set to 1).
 */
public void test_doHead_accessTimeoutTarget() {
    final FileSystemClient slowClient = new FileSystemClient() {

        @Override
        protected ResponseData getResponseData(final String uri, final boolean includeContent) {
            try {
                Thread.sleep(10000);
                return null;
            } catch (InterruptedException interrupted) {
                // surface the interruption to the caller, keeping it as the cause
                throw new CrawlingAccessException(interrupted);
            }
        }
    };
    slowClient.setAccessTimeout(1);

    CrawlingAccessException caught = null;
    try {
        slowClient.doHead("file:/tmp/test.txt");
    } catch (CrawlingAccessException e) {
        caught = e;
    }
    if (caught == null) {
        // doHead returned normally, which this test does not accept
        fail();
    }
    assertTrue(caught.getCause() instanceof InterruptedException);
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException)

Example 30 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class FtpClientTest, method test_ftpInfo.

/**
 * Exercises {@code FtpClient.FtpInfo}: the inputs {@code null}, {@code ""} and {@code "abc"}
 * must be rejected with a {@link CrawlingAccessException}, and a series of FTP URLs must
 * produce the expected URL, cache key, host, port, parent and name.
 */
public void test_ftpInfo() {
    // inputs the constructor is expected to reject
    for (final String rejected : new String[] { null, "", "abc" }) {
        try {
            new FtpClient.FtpInfo(rejected);
            fail();
        } catch (CrawlingAccessException e) {
            // ignore
        }
    }

    // explicit port, root path
    final String portUrl = "ftp://123.123.123.123:9999/";
    final FtpInfo portInfo = new FtpClient.FtpInfo(portUrl);
    assertEquals(portUrl, portInfo.toUrl());
    assertEquals("123.123.123.123:9999", portInfo.getCacheKey());
    assertEquals("123.123.123.123", portInfo.getHost());
    assertEquals(9999, portInfo.getPort());
    assertEquals("/", portInfo.getParent());
    assertNull(portInfo.getName());

    // no port given, file directly under the root
    final String fileUrl = "ftp://123.123.123.123/test.txt";
    final FtpInfo fileInfo = new FtpClient.FtpInfo(fileUrl);
    assertEquals(fileUrl, fileInfo.toUrl());
    assertEquals("123.123.123.123:21", fileInfo.getCacheKey());
    assertEquals("123.123.123.123", fileInfo.getHost());
    assertEquals(21, fileInfo.getPort());
    assertEquals("/", fileInfo.getParent());
    assertEquals("test.txt", fileInfo.getName());

    // path containing a ".." segment
    final String dotDotUrl = "ftp://123.123.123.123/aaa/../test.txt";
    final FtpInfo dotDotInfo = new FtpClient.FtpInfo(dotDotUrl);
    assertEquals("ftp://123.123.123.123/test.txt", dotDotInfo.toUrl());
    assertEquals("123.123.123.123:21", dotDotInfo.getCacheKey());
    assertEquals("123.123.123.123", dotDotInfo.getHost());
    assertEquals(21, dotDotInfo.getPort());
    assertEquals("/", dotDotInfo.getParent());
    assertEquals("test.txt", dotDotInfo.getName());
    assertEquals("ftp://123.123.123.123/", dotDotInfo.toUrl("/"));

    // port 21 spelled out, file in a sub directory
    final String nestedUrl = "ftp://123.123.123.123:21/test1/test.txt";
    final FtpInfo nestedInfo = new FtpClient.FtpInfo(nestedUrl);
    assertEquals("ftp://123.123.123.123/test1/test.txt", nestedInfo.toUrl());
    assertEquals("123.123.123.123:21", nestedInfo.getCacheKey());
    assertEquals("123.123.123.123", nestedInfo.getHost());
    assertEquals(21, nestedInfo.getPort());
    assertEquals("/test1", nestedInfo.getParent());
    assertEquals("test.txt", nestedInfo.getName());
    assertEquals("ftp://123.123.123.123/", nestedInfo.toUrl("/"));
    assertEquals("ftp://123.123.123.123/aaa/bbb/ccc.txt", nestedInfo.toUrl("/aaa//bbb/ccc.txt"));
    assertEquals("ftp://123.123.123.123/ccc.txt", nestedInfo.toUrl("/aaa/../ccc.txt"));

    // file name containing a space
    final String spacedUrl = "ftp://123.123.123.123/test test.txt";
    final FtpInfo spacedInfo = new FtpClient.FtpInfo(spacedUrl);
    assertEquals(spacedUrl, spacedInfo.toUrl());
    assertEquals("123.123.123.123:21", spacedInfo.getCacheKey());
    assertEquals("123.123.123.123", spacedInfo.getHost());
    assertEquals(21, spacedInfo.getPort());
    assertEquals("/", spacedInfo.getParent());
    assertEquals("test test.txt", spacedInfo.getName());

    // file name with non-ASCII characters
    final String unicodeUrl = "ftp://123.123.123.123/テスト.txt";
    final FtpInfo unicodeInfo = new FtpClient.FtpInfo(unicodeUrl);
    assertEquals(unicodeUrl, unicodeInfo.toUrl());
    assertEquals("123.123.123.123:21", unicodeInfo.getCacheKey());
    assertEquals("123.123.123.123", unicodeInfo.getHost());
    assertEquals(21, unicodeInfo.getPort());
    assertEquals("/", unicodeInfo.getParent());
    assertEquals("テスト.txt", unicodeInfo.getName());

    // no port given, root path
    final String rootUrl = "ftp://123.123.123.123/";
    final FtpInfo rootInfo = new FtpClient.FtpInfo(rootUrl);
    assertEquals(rootUrl, rootInfo.toUrl());
    assertEquals("123.123.123.123:21", rootInfo.getCacheKey());
    assertEquals("123.123.123.123", rootInfo.getHost());
    assertEquals(21, rootInfo.getPort());
    assertEquals("/", rootInfo.getParent());
    assertNull(rootInfo.getName());
}
Also used : FtpInfo(org.codelibs.fess.crawler.client.ftp.FtpClient.FtpInfo) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException)

Aggregations

CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)36 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)14 InputStream (java.io.InputStream)13 Map (java.util.Map)9 IOException (java.io.IOException)8 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)8 BufferedInputStream (java.io.BufferedInputStream)7 HashMap (java.util.HashMap)7 HashSet (java.util.HashSet)7 ResultData (org.codelibs.fess.crawler.entity.ResultData)7 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)7 MalformedURLException (java.net.MalformedURLException)6 AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData)6 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 File (java.io.File)5 LinkedHashMap (java.util.LinkedHashMap)5 FileInputStream (java.io.FileInputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 Date (java.util.Date)4