Search in sources :

Example 21 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class HttpClientDownloaderTest method test_download_binary_content.

/**
 * Checks that a request flagged as binary content comes back with the response
 * body available as bytes while the page's raw text stays {@code null}.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer mockServer = httpServer(13423);
    mockServer.response("binary");
    Runner.running(mockServer, new Runnable() {

        @Override
        public void run() throws Exception {
            // Request pointed at the stub server, marked as binary.
            Request binaryRequest = new Request();
            binaryRequest.setBinaryContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            final HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            // The body must be exposed as bytes only, not as text.
            byte[] expectedBody = "binary".getBytes();
            assertThat(downloaded.getRawText()).isNull();
            assertThat(downloaded.getBytes()).isEqualTo(expectedBody);
        }
    });
}
Also used : Runnable(com.github.dreamhead.moco.Runnable) HttpServer(com.github.dreamhead.moco.HttpServer) HttpUriRequest(org.apache.http.client.methods.HttpUriRequest) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Test(org.junit.Test)

Example 22 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class HttpClientDownloaderTest method test_download_fail.

/**
 * Downloading {@code PAGE_ALWAYS_NOT_EXISTS} must yield a page that is not
 * reported as a successful download.
 */
@Test
public void test_download_fail() {
    Task retryingTask = Site.me()
            .setDomain("localhost")
            .setCycleRetryTimes(5)
            .toTask();
    Request missingPageRequest = new Request(PAGE_ALWAYS_NOT_EXISTS);

    Page result = new HttpClientDownloader().download(missingPageRequest, retryingTask);

    assertThat(result.isDownloadSuccess()).isFalse();
}
Also used : Task(us.codecraft.webmagic.Task) HttpUriRequest(org.apache.http.client.methods.HttpUriRequest) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Test(org.junit.Test)

Example 23 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class PhantomJSDownloader method download.

/**
 * Fetches the request's URL through {@code getPage} and wraps the output in a {@link Page}.
 *
 * <p>An attempt counts as failed when the output contains the text
 * {@code "HTTP request failed"} (or is {@code null}). After a failed first attempt the
 * fetch is repeated up to {@code getRetryNum()} more times, stopping at the first success.
 *
 * @param request the request to fetch; its URL is logged and stored on the page
 * @param task    not used by this implementation
 * @return on success, a page carrying the raw text, URL, request and status code 200;
 *         when every attempt fails, a page carrying only the request and explicitly
 *         flagged as not successfully downloaded
 */
@Override
public Page download(Request request, Task task) {
    if (logger.isInfoEnabled()) {
        logger.info("downloading page: " + request.getUrl());
    }
    String content = getPage(request);
    // Retry only while the latest attempt failed and the retry budget is not exhausted.
    for (int attempt = 1; isFailedFetch(content) && attempt <= getRetryNum(); attempt++) {
        content = getPage(request);
    }
    if (isFailedFetch(content)) {
        // Mark the failure explicitly so callers checking isDownloadSuccess() do not
        // mistake this empty page for a successful download.
        Page failedPage = new Page();
        failedPage.setRequest(request);
        failedPage.setDownloadSuccess(false);
        return failedPage;
    }
    Page page = new Page();
    page.setRawText(content);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(200);
    return page;
}

/**
 * Tells whether the fetched output represents a failed attempt.
 *
 * <p>NOTE(review): getPage is not visible from here; a null result is treated as a
 * failure defensively instead of being dereferenced - confirm against getPage.
 */
private static boolean isFailedFetch(String content) {
    return content == null || content.contains("HTTP request failed");
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Page(us.codecraft.webmagic.Page)

Example 24 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class SeleniumDownloader method download.

/**
 * Loads the request's URL in a pooled {@link WebDriver} and returns the rendered document
 * as a {@link Page}.
 *
 * <p>Steps: borrow a driver from {@code webDriverPool}, navigate to the URL, wait
 * {@code sleepTime} milliseconds, add the site's cookies to the driver, then read the
 * {@code outerHTML} of the {@code /html} element. The driver is always handed back to the
 * pool, even when navigation or extraction throws.
 *
 * @param request the request whose URL is loaded
 * @param task    supplies the {@link Site} whose cookies are added to the driver
 * @return the page with raw text, parsed HTML, URL and request set, or {@code null} when
 *         the thread is interrupted while waiting for a driver
 */
@Override
public Page download(Request request, Task task) {
    checkInit();
    WebDriver webDriver;
    try {
        webDriver = webDriverPool.get();
    } catch (InterruptedException e) {
        logger.warn("interrupted", e);
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
        return null;
    }
    // Remember an interrupt during the wait and restore the flag only after the driver
    // work is finished, so the remaining calls in this method run as before.
    boolean interruptedWhileWaiting = false;
    try {
        logger.info("downloading page " + request.getUrl());
        webDriver.get(request.getUrl());
        try {
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            logger.warn("interrupted", e);
            interruptedWhileWaiting = true;
        }
        WebDriver.Options manage = webDriver.manage();
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
                Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
                manage.addCookie(cookie);
            }
        }
        /*
         * TODO You can add mouse event or other processes
         *
         * @author: bob.li.0718@gmail.com
         */
        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setHtml(new Html(content, request.getUrl()));
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        return page;
    } finally {
        // Always give the driver back; otherwise a failure above would leak it.
        webDriverPool.returnToPool(webDriver);
        if (interruptedWhileWaiting) {
            Thread.currentThread().interrupt();
        }
    }
}
Also used : WebDriver(org.openqa.selenium.WebDriver) Site(us.codecraft.webmagic.Site) Cookie(org.openqa.selenium.Cookie) PlainText(us.codecraft.webmagic.selector.PlainText) Html(us.codecraft.webmagic.selector.Html) Page(us.codecraft.webmagic.Page) WebElement(org.openqa.selenium.WebElement) Map(java.util.Map)

Example 25 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class ProcessorBenchmark method test.

/**
 * Manual benchmark, ignored by default: processes the same page 1000 times in two
 * consecutive rounds and prints the elapsed milliseconds of each round.
 */
@Ignore
@Test
public void test() {
    Page samplePage = new Page();
    samplePage.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    samplePage.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    samplePage.setHtml(new Html(html));
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    // First round.
    long startedAt = System.currentTimeMillis();
    int remaining = 1000;
    while (remaining-- > 0) {
        processor.process(samplePage);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // Second round, timed separately.
    startedAt = System.currentTimeMillis();
    remaining = 1000;
    while (remaining-- > 0) {
        processor.process(samplePage);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Request(us.codecraft.webmagic.Request) Html(us.codecraft.webmagic.selector.Html) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Page (us.codecraft.webmagic.Page) 29, Request (us.codecraft.webmagic.Request) 22, Test (org.junit.Test) 19, IOException (java.io.IOException) 11, HttpUriRequest (org.apache.http.client.methods.HttpUriRequest) 11, HttpServer (com.github.dreamhead.moco.HttpServer) 10, Runnable (com.github.dreamhead.moco.Runnable) 10, UnsupportedEncodingException (java.io.UnsupportedEncodingException) 10, PlainText (us.codecraft.webmagic.selector.PlainText) 8, Site (us.codecraft.webmagic.Site) 5, Task (us.codecraft.webmagic.Task) 5, Ignore (org.junit.Ignore) 3, Proxy (us.codecraft.webmagic.proxy.Proxy) 2, Html (us.codecraft.webmagic.selector.Html) 2, ArrayList (java.util.ArrayList) 1, Map (java.util.Map) 1, CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse) 1, CloseableHttpClient (org.apache.http.impl.client.CloseableHttpClient) 1, Cookie (org.openqa.selenium.Cookie) 1, WebDriver (org.openqa.selenium.WebDriver) 1