Search in sources :

Example 16 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class SeleniumDownloaderTest method testBaiduWenku.

/**
 * Manual smoke test: downloads a Baidu Wenku document page through
 * {@link SeleniumDownloader} and prints the text of every {@code div.inner}
 * element with markup removed.
 *
 * <p>Marked {@code @Ignore}; it needs a local driver at {@code chromeDriverPath}
 * and network access.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    // NOTE(review): presumably milliseconds, giving the page time to render — confirm.
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {

        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // Strip tag-like substrings first, then drop non-breaking-space entities.
    // The entity was previously misspelled "&nsbp;", which can never match real HTML.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Also used : Site(us.codecraft.webmagic.Site) Task(us.codecraft.webmagic.Task) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 17 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class ZipCodePageProcessor method processProvince.

/**
 * Scans the district rows of a province page and queues one follow-up request
 * per matched row.
 *
 * <p>Each queued request has priority 1, carries the "province" extra copied
 * from the current page's request, and a "district" extra holding the text
 * captured from the row's {@code <td>} cell.
 *
 * @param page the province page being processed
 */
private void processProvince(Page page) {
    // XPath alone cannot pinpoint the rows precisely, so a regex is used as a
    // second filter; rows that do not match the regex are dropped.
    final Pattern rowPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"", Pattern.DOTALL);
    final List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
    for (String row : rows) {
        for (Matcher m = rowPattern.matcher(row); m.find(); ) {
            String districtName = m.group(1);
            String districtLink = m.group(2);
            page.addTargetRequest(
                    new Request(districtLink)
                            .setPriority(1)
                            .putExtra("province", page.getRequest().getExtra("province"))
                            .putExtra("district", districtName));
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) Request(us.codecraft.webmagic.Request)

Example 18 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class ZipCodePageProcessor method processCountry.

/**
 * Reads the cells of the {@code newAlexa} table on the country page and
 * queues one request per cell.
 *
 * <p>Each request targets the value selected by {@code //@href} in the cell,
 * has priority 0, and carries the cell's {@code /text()} selection as the
 * "province" extra.
 *
 * @param page the country page being processed
 */
private void processCountry(Page page) {
    for (String cell : page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all()) {
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        page.addTargetRequest(new Request(href).setPriority(0).putExtra("province", provinceName));
    }
}
Also used : Request(us.codecraft.webmagic.Request)

Example 19 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class ProcessorBenchmark method test.

/**
 * Rough timing of {@code ModelPageProcessor.process}: runs two consecutive
 * batches of 1000 calls against the same in-memory page and prints the
 * elapsed wall-clock milliseconds of each batch.
 */
@Ignore
@Test
public void test() {
    final String blogUrl = "http://my.oschina.net/flashsword/blog";
    ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl(blogUrl), OschinaBlog.class);

    // Build the page by hand so nothing is downloaded while timing.
    Page page = new Page();
    page.setRequest(new Request(blogUrl));
    page.setUrl(new PlainText(blogUrl));
    page.setHtml(new Html(html));

    // First batch.
    long startedAt = System.currentTimeMillis();
    int remaining = 1000;
    while (remaining-- > 0) {
        modelPageProcessor.process(page);
    }
    long firstBatchMillis = System.currentTimeMillis() - startedAt;
    System.out.println(firstBatchMillis);

    // Second batch over the same page.
    startedAt = System.currentTimeMillis();
    remaining = 1000;
    while (remaining-- > 0) {
        modelPageProcessor.process(page);
    }
    long secondBatchMillis = System.currentTimeMillis() - startedAt;
    System.out.println(secondBatchMillis);
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Request(us.codecraft.webmagic.Request) Html(us.codecraft.webmagic.selector.Html) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 20 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class RedisScheduler method poll.

/**
 * Pops the next URL from the task's Redis queue and returns it as a request.
 *
 * <p>If a serialized request is stored in the task's item hash under the
 * SHA hex digest of the URL, that stored request is deserialized and
 * returned; otherwise a new request is built from the URL alone.
 *
 * @param task the task whose queue is polled
 * @return the next request, or {@code null} when the queue yields no URL
 */
@Override
public synchronized Request poll(Task task) {
    Jedis jedis = pool.getResource();
    try {
        String url = jedis.lpop(getQueueKey(task));
        if (url == null) {
            return null;
        }
        // NOTE(review): getBytes()/new String(bytes) use the platform default charset —
        // confirm this matches whatever code writes these hash entries.
        byte[] hashKey = (ITEM_PREFIX + task.getUUID()).getBytes();
        byte[] hashField = DigestUtils.shaHex(url).getBytes();
        byte[] stored = jedis.hget(hashKey, hashField);
        if (stored == null) {
            // Nothing stored for this URL: fall back to a plain request.
            return new Request(url);
        }
        return JSON.parseObject(new String(stored), Request.class);
    } finally {
        // Hand the connection back to the pool on every exit path.
        pool.returnResource(jedis);
    }
}
Also used : Jedis(redis.clients.jedis.Jedis) Request(us.codecraft.webmagic.Request)

Aggregations

Request (us.codecraft.webmagic.Request): 27, Test (org.junit.Test): 19, Page (us.codecraft.webmagic.Page): 9, Ignore (org.junit.Ignore): 8, Task (us.codecraft.webmagic.Task): 8, Site (us.codecraft.webmagic.Site): 4, DuplicateRemover (us.codecraft.webmagic.scheduler.component.DuplicateRemover): 4, HttpServer (com.github.dreamhead.moco.HttpServer): 3, Runnable (com.github.dreamhead.moco.Runnable): 3, IOException (java.io.IOException): 3, PlainText (us.codecraft.webmagic.selector.PlainText): 3, UnsupportedEncodingException (java.io.UnsupportedEncodingException): 2, HashSetDuplicateRemover (us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover): 2, Html (us.codecraft.webmagic.selector.Html): 2, Matcher (java.util.regex.Matcher): 1, Pattern (java.util.regex.Pattern): 1, CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse): 1, RequestBuilder (org.apache.http.client.methods.RequestBuilder): 1, CloseableHttpClient (org.apache.http.impl.client.CloseableHttpClient): 1, BeforeClass (org.junit.BeforeClass): 1