Search in sources:

Example 1 with UrlFilterImpl

use of org.codelibs.fess.crawler.filter.impl.UrlFilterImpl in project fess-crawler by codelibs.

In the class CrawlerTest, the method test_execute_bg:

/**
 * Verifies a background crawl: the crawler should reach RUNNING status after
 * {@code execute()} returns, and the stored result count should equal the
 * configured max access count once the crawl terminates.
 *
 * @throws Exception if the embedded web server or the crawler fails
 */
public void test_execute_bg() throws Exception {
    final CrawlerWebServer server = new CrawlerWebServer(7070);
    server.start();
    try {
        final String url = "http://localhost:7070/";
        final int maxCount = 50;
        final int numOfThread = 10;
        // Turn a fresh temp file into a scratch directory for transformer output.
        final File file = File.createTempFile("crawler-", "");
        file.delete();
        file.mkdirs();
        file.deleteOnExit();
        fileTransformer.setPath(file.getAbsolutePath());
        crawler.setBackground(true);
        // Restrict crawling to URLs derived from the seed URL's pattern groups.
        ((UrlFilterImpl) crawler.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler.addUrl(url);
        crawler.getCrawlerContext().setMaxAccessCount(maxCount);
        crawler.getCrawlerContext().setNumOfThread(numOfThread);
        final String sessionId = crawler.execute();
        // Poll (up to ~5s) for RUNNING instead of one fixed 3s sleep — less
        // flaky on slow machines, and consistent with test_execute_2instanceTx.
        for (int i = 0; i < 10; i++) {
            if (crawler.crawlerContext.getStatus() == CrawlerStatus.RUNNING) {
                break;
            }
            Thread.sleep(500);
        }
        assertEquals(CrawlerStatus.RUNNING, crawler.crawlerContext.getStatus());
        crawler.awaitTermination();
        assertEquals(maxCount, dataService.getCount(sessionId));
        dataService.delete(sessionId);
    } finally {
        server.stop();
    }
}
Also used : UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) File(java.io.File)

Example 2 with UrlFilterImpl

use of org.codelibs.fess.crawler.filter.impl.UrlFilterImpl in project fess-crawler by codelibs.

In the class CrawlerTest, the method test_execute_2instanceTx:

/**
 * Verifies that two crawler instances obtained from the container run
 * concurrently in background mode with fully isolated state: distinct session
 * ids, distinct contexts, and queued/stored URLs partitioned by seed host.
 *
 * @throws Exception if the embedded web servers or the crawlers fail
 */
public void test_execute_2instanceTx() throws Exception {
    final CrawlerWebServer server1 = new CrawlerWebServer(7070);
    server1.start();
    final CrawlerWebServer server2 = new CrawlerWebServer(7071);
    server2.start();
    final String url1 = "http://localhost:7070/";
    final String url2 = "http://localhost:7071/";
    try {
        final int maxCount = 10;
        final int numOfThread = 10;
        // Turn a fresh temp file into a scratch directory for transformer output.
        final File outputDir = File.createTempFile("crawler-", "");
        outputDir.delete();
        outputDir.mkdirs();
        outputDir.deleteOnExit();
        fileTransformer.setPath(outputDir.getAbsolutePath());
        final Crawler crawler1 = getComponent(Crawler.class);
        crawler1.setBackground(true);
        ((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler1.addUrl(url1);
        crawler1.getCrawlerContext().setMaxAccessCount(maxCount);
        crawler1.getCrawlerContext().setNumOfThread(numOfThread);
        // Brief pause so the two instances are set up at distinct times.
        Thread.sleep(100);
        final Crawler crawler2 = getComponent(Crawler.class);
        crawler2.setBackground(true);
        ((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler2.addUrl(url2);
        crawler2.getCrawlerContext().setMaxAccessCount(maxCount);
        crawler2.getCrawlerContext().setNumOfThread(numOfThread);
        final String session1 = crawler1.execute();
        final String session2 = crawler2.execute();
        assertNotSame(session1, session2);
        assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
        // Wait up to ~5s for each crawler to enter RUNNING before asserting.
        int attempts = 0;
        while (attempts < 10 && crawler1.crawlerContext.getStatus() != CrawlerStatus.RUNNING) {
            Thread.sleep(500);
            attempts++;
        }
        assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
        attempts = 0;
        while (attempts < 10 && crawler2.crawlerContext.getStatus() != CrawlerStatus.RUNNING) {
            Thread.sleep(500);
            attempts++;
        }
        assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
        crawler1.awaitTermination();
        crawler2.awaitTermination();
        assertEquals(maxCount, dataService.getCount(session1));
        assertEquals(maxCount, dataService.getCount(session2));
        // Every queued URL must belong to its own session's seed host.
        UrlQueue urlQueue;
        while ((urlQueue = urlQueueService.poll(session1)) != null) {
            assertTrue(urlQueue.getUrl() + "=>" + url1, urlQueue.getUrl().startsWith(url1));
        }
        while ((urlQueue = urlQueueService.poll(session2)) != null) {
            assertTrue(urlQueue.getUrl() + "=>" + url2, urlQueue.getUrl().startsWith(url2));
        }
        // Stored access results must likewise be partitioned by seed host.
        dataService.iterate(session1, accessResult -> assertTrue(accessResult.getUrl().startsWith(url1)));
        dataService.iterate(session2, accessResult -> assertTrue(accessResult.getUrl().startsWith(url2)));
        dataService.delete(session1);
        dataService.delete(session2);
    } finally {
        try {
            server1.stop();
        } finally {
            server2.stop();
        }
    }
}
Also used : UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) Crawler(org.codelibs.fess.crawler.Crawler) File(java.io.File)

Example 3 with UrlFilterImpl

use of org.codelibs.fess.crawler.filter.impl.UrlFilterImpl in project fess-crawler by codelibs.

In the class CrawlerTest, the method test_execute_2instance:

/**
 * Runs two crawler instances (with explicitly distinct session ids) against
 * two local web servers and verifies that sessions stay isolated and every
 * stored access used the GET method.
 *
 * @throws Exception if the embedded web servers or the crawlers fail
 */
public void test_execute_2instance() throws Exception {
    final CrawlerWebServer server1 = new CrawlerWebServer(7070);
    server1.start();
    final CrawlerWebServer server2 = new CrawlerWebServer(7071);
    server2.start();
    final String url1 = "http://localhost:7070/";
    final String url2 = "http://localhost:7071/";
    try {
        final int maxCount = 10;
        final int numOfThread = 10;
        // Turn a fresh temp file into a scratch directory for transformer output.
        final File file = File.createTempFile("crawler-", "");
        file.delete();
        file.mkdirs();
        file.deleteOnExit();
        fileTransformer.setPath(file.getAbsolutePath());
        final Crawler crawler1 = container.getComponent("crawler");
        crawler1.setSessionId(crawler1.getSessionId() + "1");
        crawler1.setBackground(true);
        ((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler1.addUrl(url1);
        crawler1.getCrawlerContext().setMaxAccessCount(maxCount);
        crawler1.getCrawlerContext().setNumOfThread(numOfThread);
        final Crawler crawler2 = container.getComponent("crawler");
        crawler2.setSessionId(crawler2.getSessionId() + "2");
        crawler2.setBackground(true);
        ((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler2.addUrl(url2);
        crawler2.getCrawlerContext().setMaxAccessCount(maxCount);
        crawler2.getCrawlerContext().setNumOfThread(numOfThread);
        final String sessionId1 = crawler1.execute();
        final String sessionId2 = crawler2.execute();
        assertNotSame(sessionId1, sessionId2);
        assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
        // Poll (up to ~5s per crawler) for RUNNING instead of one fixed 1s
        // sleep — less flaky and consistent with test_execute_2instanceTx.
        for (int i = 0; i < 10; i++) {
            if (crawler1.crawlerContext.getStatus() == CrawlerStatus.RUNNING) {
                break;
            }
            Thread.sleep(500);
        }
        assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
        for (int i = 0; i < 10; i++) {
            if (crawler2.crawlerContext.getStatus() == CrawlerStatus.RUNNING) {
                break;
            }
            Thread.sleep(500);
        }
        assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
        crawler1.awaitTermination();
        crawler2.awaitTermination();
        assertEquals(maxCount, dataService.getCount(sessionId1));
        assertEquals(maxCount, dataService.getCount(sessionId2));
        // Every queued URL must belong to its own session's seed host.
        UrlQueue urlQueue;
        while ((urlQueue = urlQueueService.poll(sessionId1)) != null) {
            assertTrue(urlQueue.getUrl().startsWith(url1));
        }
        while ((urlQueue = urlQueueService.poll(sessionId2)) != null) {
            assertTrue(urlQueue.getUrl().startsWith(url2));
        }
        // Stored results must be partitioned by seed host and fetched via GET.
        dataService.iterate(sessionId1, accessResult -> {
            assertTrue(accessResult.getUrl().startsWith(url1));
            assertEquals(Constants.GET_METHOD, accessResult.getMethod());
        });
        dataService.iterate(sessionId2, accessResult -> {
            assertTrue(accessResult.getUrl().startsWith(url2));
            assertEquals(Constants.GET_METHOD, accessResult.getMethod());
        });
        dataService.delete(sessionId1);
        dataService.delete(sessionId2);
    } finally {
        try {
            server1.stop();
        } finally {
            server2.stop();
        }
    }
}
Also used : UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) File(java.io.File)

Aggregations

File (java.io.File)3 UrlFilterImpl (org.codelibs.fess.crawler.filter.impl.UrlFilterImpl)3 CrawlerWebServer (org.codelibs.fess.crawler.util.CrawlerWebServer)3 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)2 Crawler (org.codelibs.fess.crawler.Crawler)1