Search in sources :

Example 61 with Page

use of org.asqatasun.entity.subject.Page in project Asqatasun by Asqatasun.

the class CrawlerImpl method saveWebResourceFromFetchedPage.

/**
 * Builds the SSP for a fetched URI and, when {@code persistOnTheFly} is set,
 * persists it immediately.
 * <p>
 * Two cases are handled, depending on the type of {@code mainWebResource}:
 * <ul>
 *   <li>It is a {@link Page}: an SSP is only created for the first
 *       successful fetch. Until then the page URL is overwritten with
 *       {@code uri} on every call; afterwards (or when the fetch failed)
 *       {@code lastFetchedSSP} is returned unchanged.</li>
 *   <li>Otherwise it is cast to {@link Site}: a new child page is created
 *       for {@code uri}, attached to the site and ranked with the current
 *       {@code pageRankCounter}, which is then incremented. The
 *       {@code successfullFetch} flag is not consulted in this case.</li>
 * </ul>
 *
 * @param uri the URI of the fetched resource
 * @param charset the charset forwarded to {@code createSSPFromPage}
 * @param fetchStatus the fetch status forwarded to {@code persistSSP}
 * @param sourceCode the source code forwarded to {@code createSSPFromPage}
 * @param successfullFetch whether the fetch succeeded (single-page case only)
 * @return the newly created SSP, or {@code lastFetchedSSP} when no SSP is
 *         created
 */
private SSP saveWebResourceFromFetchedPage(String uri, String charset, int fetchStatus, String sourceCode, boolean successfullFetch) {
    if (!(mainWebResource instanceof Page)) {
        // Site case: every fetched URI becomes a new ranked child page.
        Page childPage = webResourceDataService.createPage(uri);
        childPage.setParent((Site) mainWebResource);
        childPage.setRank(pageRankCounter);
        pageRankCounter++;
        SSP childSsp = createSSPFromPage(uri, charset, childPage, sourceCode);
        if (persistOnTheFly) {
            persistSSP(childSsp, uri, fetchStatus, childPage);
        }
        return childSsp;
    }

    if (isPageAlreadyFetched) {
        // The single page already has its SSP; no further SSP is created.
        return lastFetchedSSP;
    }

    Page auditedPage = (Page) mainWebResource;
    // In case of redirection, align the web resource URI with the URI of
    // the SSP about to be created.
    auditedPage.setURL(uri);
    if (!successfullFetch) {
        return lastFetchedSSP;
    }

    isPageAlreadyFetched = true;
    SSP pageSsp = createSSPFromPage(uri, charset, auditedPage, sourceCode);
    if (persistOnTheFly) {
        persistSSP(pageSsp, uri, fetchStatus, auditedPage);
    }
    return pageSsp;
}
Also used : Page(org.asqatasun.entity.subject.Page)

Example 62 with Page

use of org.asqatasun.entity.subject.Page in project Asqatasun by Asqatasun.

the class PageFactoryImpl method create.

/**
 * Instantiates a {@link PageImpl} and sets its URL.
 *
 * @param url the URL assigned to the new page
 * @return the newly created page
 */
@Override
public Page create(String url) {
    final Page newPage = new PageImpl();
    newPage.setURL(url);
    return newPage;
}
Also used : PageImpl(org.asqatasun.entity.subject.PageImpl) Page(org.asqatasun.entity.subject.Page)

Example 63 with Page

use of org.asqatasun.entity.subject.Page in project Asqatasun by Asqatasun.

the class ContentDAOImpl method findNumberOfOrphanContentFromWebResource.

/**
 * Runs the SSP query for the given web resource, filtered on
 * {@code DEFAULT_HTTP_STATUS_VALUE}, and returns its single result.
 * <p>
 * For a {@link Page} the query is restricted with
 * {@code WEB_RESOURCE_CONDITION}; for a {@link Site} it joins the parent of
 * the web resource and matches it against the given site.
 *
 * @param webResource the page or site to query for
 * @return the single result of the query, or 0 when the web resource is
 *         neither a {@link Page} nor a {@link Site}
 */
@Override
public Long findNumberOfOrphanContentFromWebResource(WebResource webResource) {
    final String jpql;
    if (webResource instanceof Page) {
        jpql = SELECT_DISTINCT_SSP + SSPImpl.class.getName() + " s" + JOIN_WR + WEB_RESOURCE_CONDITION + HTTP_STATUS_CONDITION;
    } else if (webResource instanceof Site) {
        jpql = SELECT_DISTINCT_SSP + SSPImpl.class.getName() + " s" + JOIN_WR + " JOIN w.parent p" + " WHERE p=:webResource" + HTTP_STATUS_CONDITION;
    } else {
        return 0L;
    }

    // Both supported cases bind the same two parameters.
    Query countQuery = entityManager.createQuery(jpql);
    countQuery.setParameter(WEB_RESOURCE_KEY, webResource);
    countQuery.setParameter(HTTP_STATUS_CODE_KEY, DEFAULT_HTTP_STATUS_VALUE);
    return (Long) countQuery.getSingleResult();
}
Also used : Site(org.asqatasun.entity.subject.Site) Query(javax.persistence.Query) Page(org.asqatasun.entity.subject.Page)

Example 64 with Page

use of org.asqatasun.entity.subject.Page in project Asqatasun by Asqatasun.

the class ContentDAOImpl method findOrphanRelatedContentList.

/**
 * Fetches one chunk of related contents whose HTTP status code equals
 * {@code DEFAULT_HTTP_STATUS_VALUE} for the given web resource.
 * <p>
 * For a {@link Page} the query is restricted with
 * {@code WEB_RESOURCE_CONDITION}; for a {@link Site} it matches the id of
 * the web resource's parent against the site id.
 *
 * @param webResource the page or site to query for
 * @param start index of the first result to return
 * @param chunkSize maximum number of results to return
 * @return the matching contents, or an empty list when the web resource is
 *         neither a {@link Page} nor a {@link Site}
 */
@Override
public List<Content> findOrphanRelatedContentList(WebResource webResource, int start, int chunkSize) {
    final boolean pageScope = webResource instanceof Page;
    if (!pageScope && !(webResource instanceof Site)) {
        return Collections.emptyList();
    }

    final Query chunkQuery;
    if (pageScope) {
        chunkQuery = entityManager.createQuery("SELECT distinct rc FROM " + RelatedContentImpl.class.getName() + RELATED_CONTENT_KEY + JOIN_PARENT_CONTENT_SET + JOIN_WR + WEB_RESOURCE_CONDITION + " AND rc.httpStatusCode =:httpStatusCode");
        chunkQuery.setParameter(WEB_RESOURCE_KEY, webResource);
    } else {
        chunkQuery = entityManager.createQuery("SELECT distinct rc FROM " + RelatedContentImpl.class.getName() + RELATED_CONTENT_KEY + JOIN_PARENT_CONTENT_SET + JOIN_WR + " WHERE w.parent.id=:idWebResource" + " AND rc.httpStatusCode =:httpStatusCode");
        chunkQuery.setParameter(ID_WEB_RESOURCE_KEY, webResource.getId());
    }

    // Status filter and paging are common to both scopes.
    chunkQuery.setParameter(HTTP_STATUS_CODE_KEY, DEFAULT_HTTP_STATUS_VALUE);
    chunkQuery.setFirstResult(start);
    chunkQuery.setMaxResults(chunkSize);
    List<Content> chunk = (List<Content>) chunkQuery.getResultList();

    // NOTE(review): only the Site scope flushes and closes the entity
    // manager after reading; confirm the Page scope skips this on purpose.
    if (!pageScope) {
        flushAndCloseEntityManager();
    }
    return chunk;
}
Also used : Site(org.asqatasun.entity.subject.Site) Query(javax.persistence.Query) Page(org.asqatasun.entity.subject.Page)

Example 65 with Page

use of org.asqatasun.entity.subject.Page in project Asqatasun by Asqatasun.

the class ScenarioLoaderImplTest method testRun.

/**
 * Functional test of the run method of class ScenarioLoaderImpl.
 * <p>
 * The scenario uses the pages hosted at http://site.asqatasun.ovh/ and
 * visits, in this order:
 * <ol>
 *   <li>http://site.asqatasun.ovh/ (root page)</li>
 *   <li>page-access-forbidden-for-robots.html (via the link to the page
 *       excluded by robots.txt), then "Back"</li>
 *   <li>the root page again</li>
 *   <li>page-1.html (via the "This page will be crawled" link)</li>
 *   <li>page-access-forbidden-for-robots.html, then "Back"</li>
 *   <li>page-1.html again</li>
 *   <li>page-2.html (via the "This page will be crawled" link)</li>
 *   <li>page-access-forbidden-for-robots.html</li>
 * </ol>
 * For each retrieved page, a web resource is created and added to the
 * parent web resource, and an SSP is created that holds the source code of
 * the page.
 * <p>
 * The HarFileContentLoaderFactory and the ProfileFactory are not mocked in
 * this test.
 */
public void testRun() {
    System.out.println("run");

    // One entry per page visit, in scenario order, with the fixture file
    // that supplies the DOM of the matching SSP.
    final String[] visitedUrls = {
        ROOT_PAGE_URL,
        PAGE_ACCESS_FORBIDDEN_URL,
        ROOT_PAGE_URL,
        PAGE_1_URL,
        PAGE_ACCESS_FORBIDDEN_URL,
        PAGE_1_URL,
        PAGE_2_URL,
        PAGE_ACCESS_FORBIDDEN_URL
    };
    final String[] domFixtures = {
        "htmlFiles/root-page_1.html",
        "htmlFiles/page-access-forbidden_1.html",
        "htmlFiles/root-page_1.html",
        "htmlFiles/page-1_1.html",
        "htmlFiles/page-access-forbidden_1.html",
        "htmlFiles/page-1_1.html",
        "htmlFiles/page-2_1.html",
        "htmlFiles/page-access-forbidden_1.html"
    };
    final int visitCount = visitedUrls.length;

    Date date = new Date();
    DateFactory mockDateFactory = createMock(DateFactory.class);
    expect(mockDateFactory.createDate()).andReturn(date).times(8);
    ContentDataService mockContentDataService = createMock(ContentDataService.class);
    WebResourceDataService mockWebResourceDataService = createMock(WebResourceDataService.class);

    // Each visit is expected to create one page and save it once.
    final Page[] mockPages = new Page[visitCount];
    for (int i = 0; i < visitCount; i++) {
        mockPages[i] = createMock(Page.class);
        expect(mockWebResourceDataService.createPage(visitedUrls[i])).andReturn(mockPages[i]).once();
        expect(mockWebResourceDataService.saveOrUpdate(mockPages[i])).andReturn(mockPages[i]).once();
    }

    // Each visit is expected to produce one SSP bound to its page.
    final SSP[] mockSsps = new SSP[visitCount];
    for (int i = 0; i < visitCount; i++) {
        SSP ssp = createMock(SSP.class);
        expect(mockContentDataService.getSSP(date, visitedUrls[i], pageMap.get(visitedUrls[i]), null, HttpStatus.SC_OK)).andReturn(ssp).once();
        expect(ssp.getHttpStatusCode()).andReturn(HttpStatus.SC_OK).once();
        expect(ssp.getURI()).andReturn(visitedUrls[i]).once();
        ssp.setDOM(readFile(domFixtures[i], false));
        ssp.setPage(mockPages[i]);
        expect(mockContentDataService.saveOrUpdate(ssp)).andReturn(ssp).once();
        mockSsps[i] = ssp;
    }

    // The parent site is expected to receive every created page as a child.
    Site mockSite = createMock(Site.class);
    for (Page child : mockPages) {
        mockSite.addChild(child);
        expectLastCall().once();
    }
    expect(mockSite.getURL()).andReturn(ROOT_PAGE_URL).once();

    for (Page recordedPage : mockPages) {
        replay(recordedPage);
    }
    replay(mockSite);
    replay(mockWebResourceDataService);
    replay(mockDateFactory);
    replay(mockContentDataService);
    for (SSP recordedSsp : mockSsps) {
        replay(recordedSsp);
    }

    // NOTE(review): the code that exercises the class under test is
    // disabled, so nothing runs between replay and verify. As written, the
    // recorded expectations presumably cannot be satisfied; confirm whether
    // this test is intentionally inert. Disabled code, for reference:
    //        ScenarioLoaderImpl instance = new ScenarioLoaderImpl(
    //                mockSite,
    //                readFile("MyTest.json", true),
    //                harFileContentLoaderFactory);
    //        instance.setContentDataService(mockContentDataService);
    //        instance.setContentFactory(mockContentFactory);
    //        instance.setDateFactory(mockDateFactory);
    //        instance.setWebResourceDataService(mockWebResourceDataService);
    //
    //        instance.run();

    for (Page recordedPage : mockPages) {
        verify(recordedPage);
    }
    verify(mockSite);
    verify(mockWebResourceDataService);
    verify(mockDateFactory);
    verify(mockContentDataService);
    for (SSP recordedSsp : mockSsps) {
        verify(recordedSsp);
    }
}
Also used : Site(org.asqatasun.entity.subject.Site) SSP(org.asqatasun.entity.audit.SSP) WebResourceDataService(org.asqatasun.entity.service.subject.WebResourceDataService) Page(org.asqatasun.entity.subject.Page) DateFactory(org.asqatasun.util.factory.DateFactory) ContentDataService(org.asqatasun.entity.service.audit.ContentDataService)

Aggregations

Page (org.asqatasun.entity.subject.Page): 77, Site (org.asqatasun.entity.subject.Site): 68, WebResource (org.asqatasun.entity.subject.WebResource): 7, ArrayList (java.util.ArrayList): 6, Query (javax.persistence.Query): 6, Audit (org.asqatasun.entity.audit.Audit): 4, ProcessResult (org.asqatasun.entity.audit.ProcessResult): 4, List (java.util.List): 3, Map (java.util.Map): 3, SSP (org.asqatasun.entity.audit.SSP): 3, Content (org.asqatasun.entity.audit.Content): 2, DefiniteResult (org.asqatasun.entity.audit.DefiniteResult): 2, ForbiddenPageException (org.asqatasun.webapp.exception.ForbiddenPageException): 2, ColumnBuilderException (ar.com.fdvs.dj.domain.builders.ColumnBuilderException): 1, IOException (java.io.IOException): 1, JRException (net.sf.jasperreports.engine.JRException): 1, PreProcessResult (org.asqatasun.entity.audit.PreProcessResult): 1, Scope (org.asqatasun.entity.reference.Scope): 1, ContentDataService (org.asqatasun.entity.service.audit.ContentDataService): 1, WebResourceDataService (org.asqatasun.entity.service.subject.WebResourceDataService): 1