Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class ProjectController, method createImportWebCrawlAction:
protected IListenerAction createImportWebCrawlAction() {
    return new IListenerAction() {
        public Class<?> getParameterClass() {
            return DesignSelectionState.class;
        }

        public boolean performAction(Object prms) {
            DesignSelectionState selection = (DesignSelectionState) prms;

            ProjectInteraction.IWebCrawlImport importParms =
                interaction.requestWebCrawlParms(project, WebCrawler.DEFAULT_MAX_TO_CRAWL);

            if (importParms != null) {
                String designName = importParms.getDesignName();
                Design design;

                if (designName == null) {
                    ProjectInteraction.DesignRequestData requestData =
                        requestNewDesignParms(false);

                    if (requestData == null) {
                        // canceled!
                        return false;
                    }

                    design = new Design(requestData.designName, requestData.deviceTypes);
                }
                else {
                    design = project.getDesign(designName);
                }

                int defaultDepth = importParms.getDefaultDepth();
                List<URLCrawlEntry> urls = importParms.getURLsToCrawl();

                if ((urls == null) || (urls.size() == 0)) {
                    interaction.reportProblem(ImportWebCrawlThread.IMPORT_WEB_CRAWL,
                                              noURLsToCrawlError);
                    return false;
                }

                // If new, indicate that the work thread should add the
                // new design to the project when it completes;
                // -1 indicates that the design is *not* new.
                int beforeIndex = ImportWebCrawlThread.EXISTING_DESIGN;

                if (designName == null) {
                    Design selectedDesign = selection.getSelectedDesign();

                    beforeIndex = (selectedDesign != null)
                        ? (project.getDesigns().indexOf(selectedDesign) + 1)
                        : project.getDesigns().size();
                }

                ImportWebCrawlThread workThread =
                    new ImportWebCrawlThread(interaction,
                                             undoMgr,
                                             project,
                                             design,
                                             beforeIndex,
                                             importParms.getMaxPages(),
                                             defaultDepth,
                                             urls,
                                             importParms.pruneSameURLs(),
                                             importParms.getBrowserWidth(),
                                             importParms.getBrowserHeight(),
                                             importParms.captureImages());

                ThreadManager.startNewThread(workThread);

                return true;
            }

            return false;
        }
    };
}
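The work thread receives the URL list exactly as the dialog produced it; each element is a URLCrawlEntry built from a URL, a crawl depth, and a restricted domain. A minimal hand-built equivalent of what getURLsToCrawl() returns might look like the sketch below (illustrative URLs, depths, and domains; assumes java.util.List/ArrayList and URLCrawlEntry are imported; in CogTool the list is assembled by WebCrawlImportDialog.saveSettings(), shown next):

    // Illustrative values only; the real list comes from the import dialog.
    List<URLCrawlEntry> urls = new ArrayList<URLCrawlEntry>();
    urls.add(new URLCrawlEntry("http://www.cmu.edu/", 2, "www.cmu.edu"));
    urls.add(new URLCrawlEntry("http://www.cs.cmu.edu/", 1, "www.cs.cmu.edu"));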
Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class WebCrawlImportDialog, method saveSettings:
protected void saveSettings() {
    String comboValue = designCombo.getText();

    designName = CREATE_NEW_DESIGN.equals(comboValue) ? null : comboValue;

    browserWidth = getIntegerValue(browserWidthEntry, DEFAULT_BROWSER_WIDTH);
    browserHeight = getIntegerValue(browserHeightEntry, DEFAULT_BROWSER_HEIGHT);
    maxPages = getIntegerValue(maxPagesToImport, USE_SYSTEM_DEFAULT);

    urlsToCrawl = new ArrayList<URLCrawlEntry>();

    String url = urlText.getText();

    /* The last character of the restricted domain must be removed so that a
     * restricted domain of "www.cmu.edu/" will also include www.cmu.edu in
     * the web crawl.
     */
    urlsToCrawl.add(new URLCrawlEntry(url,
                                      maximumDepth,
                                      restrictedDomain.substring(0, restrictedDomain.length() - 1)));

    if (isValidURL2) {
        url = urlText2.getText();
        urlsToCrawl.add(new URLCrawlEntry(url,
                                          maximumDepth2,
                                          restrictedDomain2.substring(0, restrictedDomain2.length() - 1)));

        if (isValidURL3) {
            url = urlText3.getText();
            urlsToCrawl.add(new URLCrawlEntry(url,
                                              maximumDepth3,
                                              restrictedDomain3.substring(0, restrictedDomain3.length() - 1)));
        }
    }

    if (pruneCrawlOnSame != null) {
        pruneSameURLsFlag = pruneCrawlOnSame.getSelection();
    }

    capturePageImages = capturePageImagesOption.getSelection();
}
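The substring(0, length() - 1) calls above simply drop the trailing '/' carried by the dialog's restricted-domain fields, so a restriction of "www.cmu.edu/" still matches www.cmu.edu itself. A hypothetical helper (not part of CogTool) making that trimming explicit:

    // Hypothetical helper mirroring the trimming done inline in saveSettings().
    // trimTrailingSlash("www.cmu.edu/") returns "www.cmu.edu", so the domain
    // restriction also matches URLs without the trailing slash.
    static String trimTrailingSlash(String restrictedDomain) {
        return restrictedDomain.substring(0, restrictedDomain.length() - 1);
    }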
Use of edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry in project cogtool by cogtool.
The class WebCrawler, method crawlWeb:
/**
 * Crawl the URL specifications contained by the given list -- the member
 * objects should be instances of URLCrawlEntry or a subclass.
 * The number of visits will be limited to maxURLs,
 * using the given default depth. Visits are performed breadth-first.
 *
 * Fetch resulting page descriptions afterward via getCrawledURLs().
 * Each call to crawlWeb will add new descriptions to the collection.
 *
 * @param crawlEntries the list of URLCrawlEntry instances
 * @param defaultDepth the default depth for URLs without specified depths
 * @param maxURLs the maximum number of valid pages to visit
 */
public void crawlWeb(List<URLCrawlEntry> crawlEntries, int defaultDepth, int maxURLs) {
    int numURLsCrawled = 0;

    // Seed the FIFO of URLCrawlEntry's yet to crawl
    for (URLCrawlEntry entry : crawlEntries) {
        if (shouldCrawlLink(entry)) {
            urlsToCrawl.add(entry);
        }
    }

    // Keep crawling while entries remain, the page limit has not been hit,
    // and the crawl may continue (e.g., it stops if the cancel button has
    // been pushed).
    while (!urlsToCrawl.isEmpty() && (numURLsCrawled < maxURLs) && crawlMayContinue()) {
        // Important to pick the entry off the front of the list (truly
        // implementing a FIFO), so we do a breadth-first walk.
        URLCrawlEntry nextEntry = urlsToCrawl.removeFirst();

        // Strip any #... fragment from the URL; needed only for root URLs,
        // since anything lower down will have already been stripped.
        nextEntry.stripFragment();

        if (nextEntry.isEmpty()) {
            // URL string is now empty!
            continue;
        }

        // Root entries must be absolute; see below for the part that makes
        // relative child links absolute.
        try {
            nextEntry.ensureAbsolute();
        }
        catch (IOException ex) {
            throw new URLParseError(nextEntry.getURL(), ex);
        }

        // crawlNeeded checks that the entry hasn't already been seen.
        if (crawlNeeded(nextEntry)) {
            PageInfo urlPage = fetchPage(nextEntry);

            // If the page is acceptable, record and count it.
            if (urlPage != null) {
                // Update the count fetched this time
                numURLsCrawled++;

                // Record page's absolute URL; used by crawlNeeded()
                // to decide that this URL no longer needs to be fetched.
                crawledURLs.put(nextEntry.getURL(), urlPage);

                // If the depth for this page allows more crawling,
                // add its child links to the queue.
                int toDepth = nextEntry.getToDepth();

                if (toDepth == URLCrawlEntry.USE_DEFAULT_DEPTH) {
                    // Can only happen at the top level of the tree being walked
                    toDepth = defaultDepth;
                }

                if (toDepth > 0) {
                    Iterator<URLLabeledLink> newLinks = urlPage.links.iterator();
                    URL contextURL = null;

                    while (newLinks.hasNext()) {
                        URLLabeledLink newLink = newLinks.next();

                        // Child links inherit the parent's domain restriction.
                        newLink.setDomain(nextEntry.getDomain());

                        // Again, the #... fragment is useless to us
                        newLink.stripFragment();

                        // Make relative links absolute, both to resolve path
                        // issues and for the protocol scheme check inside
                        // shouldCrawlLink.
                        if (!newLink.isAbsolute()) {
                            if (contextURL == null) {
                                try {
                                    // Get the URL of the current page
                                    // to use as the context for all
                                    // relative links that it contains
                                    contextURL = new URL(urlPage.url);
                                }
                                catch (IOException ex) {
                                    throw new URLParseError(urlPage.url, ex);
                                }
                            }

                            try {
                                URL absoluteURL = new URL(contextURL, newLink.getURL());

                                newLink.setURL(absoluteURL.toString());
                            }
                            catch (IOException ex) {
                                throw new URLParseError(newLink.getURL(), ex);
                            }
                        }

                        // Child links are one level deeper than the current page.
                        newLink.setToDepth(toDepth - 1);

                        // If the child link should be crawled, queue it.
                        if (shouldCrawlLink(newLink)) {
                            urlsToCrawl.add(newLink);
                        }
                    }
                }
            }
        }
    }
}
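As the javadoc above describes, a caller queues URLCrawlEntry instances, invokes crawlWeb with a default depth and a page limit, and afterward reads the accumulated page descriptions via getCrawledURLs(). A minimal sketch of that flow, assuming a configured WebCrawler instance named crawler is already available (the fetching and cancellation behavior it relies on is defined elsewhere in the class) and using illustrative URLs, depths, and limits:

    List<URLCrawlEntry> entries = new ArrayList<URLCrawlEntry>();

    // Explicit depth of 2, restricted to the www.cmu.edu domain (illustrative values).
    entries.add(new URLCrawlEntry("http://www.cmu.edu/", 2, "www.cmu.edu"));

    // No explicit depth: crawlWeb substitutes defaultDepth for USE_DEFAULT_DEPTH.
    entries.add(new URLCrawlEntry("http://www.cs.cmu.edu/",
                                  URLCrawlEntry.USE_DEFAULT_DEPTH,
                                  "www.cs.cmu.edu"));

    // Breadth-first crawl: default depth 1, at most 100 pages.
    crawler.crawlWeb(entries, 1, 100);

    // Page descriptions accumulate across calls; a caller would normally capture
    // the result of getCrawledURLs() (its exact return type is not shown above).
    crawler.getCrawledURLs();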