Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class ProjectController, method createImportWebCrawlAction:
protected IListenerAction createImportWebCrawlAction() {
    return new IListenerAction() {
        public Class<?> getParameterClass() {
            return DesignSelectionState.class;
        }

        public boolean performAction(Object prms) {
            DesignSelectionState selection = (DesignSelectionState) prms;

            ProjectInteraction.IWebCrawlImport importParms =
                interaction.requestWebCrawlParms(project, WebCrawler.DEFAULT_MAX_TO_CRAWL);

            if (importParms != null) {
                String designName = importParms.getDesignName();
                Design design;

                if (designName == null) {
                    ProjectInteraction.DesignRequestData requestData =
                        requestNewDesignParms(false);

                    if (requestData == null) {
                        // canceled!
                        return false;
                    }

                    design = new Design(requestData.designName, requestData.deviceTypes);
                }
                else {
                    design = project.getDesign(designName);
                }

                int defaultDepth = importParms.getDefaultDepth();
                List<URLCrawlEntry> urls = importParms.getURLsToCrawl();

                if ((urls == null) || (urls.size() == 0)) {
                    interaction.reportProblem(ImportWebCrawlThread.IMPORT_WEB_CRAWL,
                                              noURLsToCrawlError);
                    return false;
                }

                // If new, indicate that the work thread should add the
                // new design to the project when it completes;
                // -1 indicates that the design is *not* new.
                int beforeIndex = ImportWebCrawlThread.EXISTING_DESIGN;

                if (designName == null) {
                    Design selectedDesign = selection.getSelectedDesign();

                    beforeIndex = (selectedDesign != null)
                        ? (project.getDesigns().indexOf(selectedDesign) + 1)
                        : project.getDesigns().size();
                }

                ImportWebCrawlThread workThread =
                    new ImportWebCrawlThread(interaction,
                                             undoMgr,
                                             project,
                                             design,
                                             beforeIndex,
                                             importParms.getMaxPages(),
                                             defaultDepth,
                                             urls,
                                             importParms.pruneSameURLs(),
                                             importParms.getBrowserWidth(),
                                             importParms.getBrowserHeight(),
                                             importParms.captureImages());

                ThreadManager.startNewThread(workThread);

                return true;
            }

            return false;
        }
    };
}
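The work thread receives the URL list exactly as the dialog produced it; each element is a URLCrawlEntry built from a URL, a crawl depth, and a restricted domain. A minimal hand-built equivalent of what getURLsToCrawl() returns might look like the sketch below (illustrative URLs, depths, and domains; assumes java.util.List/ArrayList and URLCrawlEntry are imported; in CogTool the list is assembled by WebCrawlImportDialog.saveSettings(), shown next):

    // Illustrative values only; the real list comes from the import dialog.
    List<URLCrawlEntry> urls = new ArrayList<URLCrawlEntry>();
    urls.add(new URLCrawlEntry("http://www.cmu.edu/", 2, "www.cmu.edu"));
    urls.add(new URLCrawlEntry("http://www.cs.cmu.edu/", 1, "www.cs.cmu.edu"));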
Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class WebCrawlImportDialog, method saveSettings:
protected void saveSettings() {
    String comboValue = designCombo.getText();

    designName = CREATE_NEW_DESIGN.equals(comboValue) ? null : comboValue;

    browserWidth = getIntegerValue(browserWidthEntry, DEFAULT_BROWSER_WIDTH);
    browserHeight = getIntegerValue(browserHeightEntry, DEFAULT_BROWSER_HEIGHT);
    maxPages = getIntegerValue(maxPagesToImport, USE_SYSTEM_DEFAULT);

    urlsToCrawl = new ArrayList<URLCrawlEntry>();

    String url = urlText.getText();

    /* The last character of the restricted domain must be removed so that a
     * restricted domain of "www.cmu.edu/" will also include www.cmu.edu in
     * the web crawl.
     */
    urlsToCrawl.add(new URLCrawlEntry(url,
                                      maximumDepth,
                                      restrictedDomain.substring(0, restrictedDomain.length() - 1)));

    if (isValidURL2) {
        url = urlText2.getText();
        urlsToCrawl.add(new URLCrawlEntry(url,
                                          maximumDepth2,
                                          restrictedDomain2.substring(0, restrictedDomain2.length() - 1)));

        if (isValidURL3) {
            url = urlText3.getText();
            urlsToCrawl.add(new URLCrawlEntry(url,
                                              maximumDepth3,
                                              restrictedDomain3.substring(0, restrictedDomain3.length() - 1)));
        }
    }

    if (pruneCrawlOnSame != null) {
        pruneSameURLsFlag = pruneCrawlOnSame.getSelection();
    }

    capturePageImages = capturePageImagesOption.getSelection();
}
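The substring(0, length() - 1) calls above simply drop the trailing '/' carried by the dialog's restricted-domain fields, so a restriction of "www.cmu.edu/" still matches www.cmu.edu itself. A hypothetical helper (not part of CogTool) making that trimming explicit:

    // Hypothetical helper mirroring the trimming done inline in saveSettings().
    // trimTrailingSlash("www.cmu.edu/") returns "www.cmu.edu", so the domain
    // restriction also matches URLs without the trailing slash.
    static String trimTrailingSlash(String restrictedDomain) {
        return restrictedDomain.substring(0, restrictedDomain.length() - 1);
    }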
Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class WebCrawler, method crawlWeb:
/**
 * Crawl the URL specifications contained by the given list -- the member
 * objects should be instances of URLCrawlEntry or a subclass.
 * The number of visits will be limited to maxURLs,
 * using the given default depth. Visits are performed breadth-first.
 *
 * Fetch resulting page descriptions afterward via getCrawledURLs().
 * Each call to crawlWeb will add new descriptions to the collection.
 *
 * @param crawlEntries the list of URLCrawlEntry instances
 * @param defaultDepth the default depth for URLs without specified depths
 * @param maxURLs the maximum number of valid pages to visit
 */
public void crawlWeb(List<URLCrawlEntry> crawlEntries, int defaultDepth, int maxURLs) {
    int numURLsCrawled = 0;

    // Seed the FIFO of URLCrawlEntry's yet to crawl
    for (URLCrawlEntry entry : crawlEntries) {
        if (shouldCrawlLink(entry)) {
            urlsToCrawl.add(entry);
        }
    }

    // Keep crawling while entries remain, the page limit has not been hit,
    // and the crawl may continue (e.g., it stops if the cancel button has
    // been pushed).
    while (!urlsToCrawl.isEmpty() && (numURLsCrawled < maxURLs) && crawlMayContinue()) {
        // Important to pick the entry off the front of the list (truly
        // implementing a FIFO), so we do a breadth-first walk.
        URLCrawlEntry nextEntry = urlsToCrawl.removeFirst();

        // Strip any #... fragment from the URL; needed only for root URLs,
        // since anything lower down will have already been stripped.
        nextEntry.stripFragment();

        if (nextEntry.isEmpty()) {
            // URL string is now empty!
            continue;
        }

        // Root entries must be absolute; see below for the part that makes
        // relative child links absolute.
        try {
            nextEntry.ensureAbsolute();
        }
        catch (IOException ex) {
            throw new URLParseError(nextEntry.getURL(), ex);
        }

        // crawlNeeded checks that the entry hasn't already been seen.
        if (crawlNeeded(nextEntry)) {
            PageInfo urlPage = fetchPage(nextEntry);

            // If the page is acceptable, record and count it.
            if (urlPage != null) {
                // Update the count fetched this time
                numURLsCrawled++;

                // Record page's absolute URL; used by crawlNeeded()
                // to decide that this URL no longer needs to be fetched.
                crawledURLs.put(nextEntry.getURL(), urlPage);

                // If the depth for this page allows more crawling,
                // add its child links to the queue.
                int toDepth = nextEntry.getToDepth();

                if (toDepth == URLCrawlEntry.USE_DEFAULT_DEPTH) {
                    // Can only happen at the top level of the tree being walked
                    toDepth = defaultDepth;
                }

                if (toDepth > 0) {
                    Iterator<URLLabeledLink> newLinks = urlPage.links.iterator();
                    URL contextURL = null;

                    while (newLinks.hasNext()) {
                        URLLabeledLink newLink = newLinks.next();

                        // Child links inherit the parent's domain restriction.
                        newLink.setDomain(nextEntry.getDomain());

                        // Again, the #... fragment is useless to us
                        newLink.stripFragment();

                        // Make relative links absolute, both to resolve path
                        // issues and for the protocol scheme check inside
                        // shouldCrawlLink.
                        if (!newLink.isAbsolute()) {
                            if (contextURL == null) {
                                try {
                                    // Get the URL of the current page
                                    // to use as the context for all
                                    // relative links that it contains
                                    contextURL = new URL(urlPage.url);
                                }
                                catch (IOException ex) {
                                    throw new URLParseError(urlPage.url, ex);
                                }
                            }

                            try {
                                URL absoluteURL = new URL(contextURL, newLink.getURL());

                                newLink.setURL(absoluteURL.toString());
                            }
                            catch (IOException ex) {
                                throw new URLParseError(newLink.getURL(), ex);
                            }
                        }

                        // Child links are one level deeper than the current page.
                        newLink.setToDepth(toDepth - 1);

                        // If the child link should be crawled, queue it.
                        if (shouldCrawlLink(newLink)) {
                            urlsToCrawl.add(newLink);
                        }
                    }
                }
            }
        }
    }
}
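As the javadoc above describes, a caller queues URLCrawlEntry instances, invokes crawlWeb with a default depth and a page limit, and afterward reads the accumulated page descriptions via getCrawledURLs(). A minimal sketch of that flow, assuming a configured WebCrawler instance named crawler is already available (the fetching and cancellation behavior it relies on is defined elsewhere in the class) and using illustrative URLs, depths, and limits:

    List<URLCrawlEntry> entries = new ArrayList<URLCrawlEntry>();

    // Explicit depth of 2, restricted to the www.cmu.edu domain (illustrative values).
    entries.add(new URLCrawlEntry("http://www.cmu.edu/", 2, "www.cmu.edu"));

    // No explicit depth: crawlWeb substitutes defaultDepth for USE_DEFAULT_DEPTH.
    entries.add(new URLCrawlEntry("http://www.cs.cmu.edu/",
                                  URLCrawlEntry.USE_DEFAULT_DEPTH,
                                  "www.cs.cmu.edu"));

    // Breadth-first crawl: default depth 1, at most 100 pages.
    crawler.crawlWeb(entries, 1, 100);

    // Page descriptions accumulate across calls; a caller would normally capture
    // the result of getCrawledURLs() (its exact return type is not shown above).
    crawler.getCrawledURLs();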