Use of edu.cmu.cs.hcii.cogtool.model.URLLabeledLink in project cogtool by cogtool.
The class WebCrawler, method crawlWeb.
/**
 * Crawls, breadth-first, the URL specifications in the given list; each
 * member should be a URLCrawlEntry (or an instance of a subclass).
 * At most maxURLs pages are recorded per call, and entries that carry no
 * explicit depth are crawled to the given default depth.
 *
 * The resulting page descriptions are retrieved afterward through
 * getCrawledURLs(); every call to crawlWeb adds to that collection.
 *
 * @param crawlEntries the list of URLCrawlEntry instances to start from
 * @param defaultDepth the depth applied to entries without a specified depth
 * @param maxURLs the maximum number of valid pages to visit
 * @throws URLParseError if an entry cannot be made absolute, or if a page
 *         URL or a relative link found on a page cannot be parsed
 */
public void crawlWeb(List<URLCrawlEntry> crawlEntries, int defaultDepth, int maxURLs) {
    // Number of pages successfully fetched during this call.
    int pagesFetched = 0;

    // Seed the pending queue with every root entry accepted by the link filter.
    for (URLCrawlEntry rootEntry : crawlEntries) {
        if (shouldCrawlLink(rootEntry)) {
            urlsToCrawl.add(rootEntry);
        }
    }

    // Keep going while work remains, the page budget is not exhausted,
    // and the crawl is still allowed to continue.
    while (!urlsToCrawl.isEmpty() && (pagesFetched < maxURLs) && crawlMayContinue()) {
        // Take from the front of the queue so the walk stays breadth-first.
        URLCrawlEntry current = urlsToCrawl.removeFirst();

        // Drop any #... fragment; this only matters for root entries since
        // links found on pages are stripped before being queued.
        current.stripFragment();
        if (current.isEmpty()) {
            // Nothing is left of the URL once the fragment is gone.
            continue;
        }

        try {
            current.ensureAbsolute();
        } catch (IOException ex) {
            throw new URLParseError(current.getURL(), ex);
        }

        // crawlNeeded() decides whether this entry still has to be fetched.
        if (!crawlNeeded(current)) {
            continue;
        }

        PageInfo fetched = fetchPage(current);
        if (fetched == null) {
            // The page was not acceptable; it is neither recorded nor counted.
            continue;
        }

        pagesFetched++;

        // Remember the page under its absolute URL; crawlNeeded() relies on
        // this record to decide that the URL need not be fetched again.
        crawledURLs.put(current.getURL(), fetched);

        // Work out how much deeper crawling may go from this page.
        int declaredDepth = current.getToDepth();
        int remainingDepth =
            (declaredDepth == URLCrawlEntry.USE_DEFAULT_DEPTH) ? defaultDepth
                                                               : declaredDepth;
        if (remainingDepth <= 0) {
            continue;
        }

        // Parsed lazily: only needed once a relative link shows up.
        URL pageURL = null;

        for (Iterator<URLLabeledLink> linkIt = fetched.links.iterator(); linkIt.hasNext();) {
            URLLabeledLink child = linkIt.next();

            child.setDomain(current.getDomain());

            // The #... fragment is of no use for child links either.
            child.stripFragment();

            // Make the link absolute before it is handed to shouldCrawlLink().
            if (!child.isAbsolute()) {
                if (pageURL == null) {
                    // The current page's URL is the context for every
                    // relative link that the page contains.
                    try {
                        pageURL = new URL(fetched.url);
                    } catch (IOException ex) {
                        throw new URLParseError(fetched.url, ex);
                    }
                }

                try {
                    URL resolved = new URL(pageURL, child.getURL());
                    child.setURL(resolved.toString());
                } catch (IOException ex) {
                    throw new URLParseError(child.getURL(), ex);
                }
            }

            child.setToDepth(remainingDepth - 1);

            // Queue the child at the back if the link filter accepts it.
            if (shouldCrawlLink(child)) {
                urlsToCrawl.add(child);
            }
        }
    }
}
Use of edu.cmu.cs.hcii.cogtool.model.URLLabeledLink in project cogtool by cogtool.
The class ImportWebCrawlThread, method doneCallback.
/**
 * Turns the crawl results into design content: a Frame is created for each
 * page visited and parsed, and each child link on a page becomes a Widget
 * plus, when its target frame can be found, a Transition.
 */
@Override
public void doneCallback() {
    // Performed by the main UI thread
    try {
        // Stop if an exception from the import was reported here,
        // or if the import was canceled.
        if (RcvrExceptionHandler.recoverWorkThread(this, interaction) || isCanceled()) {
            return;
        }

        DemoStateManager stateMgr = DemoStateManager.getStateManager(project, design);

        // A paused crawl stores its remaining URLs as an attribute of the design.
        if (isPaused()) {
            DefaultCmd.setAttribute(design, stateMgr, WidgetAttributes.PAUSED_WEB_CRAWL_ATTR, importWeb.getURLsToCrawl(), interaction, editSequence);
        }

        // EXISTING_DESIGN marks a design that is already part of the project;
        // any other index means the design still has to be added.
        if (insertBeforeIndex != EXISTING_DESIGN) {
            ProjectCmd.addNewDesign(project, design, insertBeforeIndex, IMPORT_WEB_DESIGN, editSequence);
        }

        Collection<PageInfo> crawledPages = importWeb.getCrawledURLs();
        Set<DeviceType> deviceTypes = design.getDeviceTypes();

        // Link widgets whose target frame was not yet known, keyed to the target URL
        Map<IWidget, String> pendingTransitions = new HashMap<IWidget, String>();

        int minWidth = DesignUtil.getFrameMinWidth();
        int minHeight = DesignUtil.getFrameMinHeight();
        double scale = DesignUtil.getFrameScaleFactor();
        DesignUtil.IFrameSituator situator = new DesignUtil.ByRowFrameSituator(0.0, 0.0, 16.0, 16.0, minWidth, minHeight, CogToolPref.FRAMES_PER_ROW.getInt(), scale);

        // First pass: one frame per page, one link widget per usable link.
        for (PageInfo visited : crawledPages) {
            ImportWebURL.ImportPageInfo page = (ImportWebURL.ImportPageInfo) visited;

            Frame frame = new Frame(page.url, deviceTypes);
            knownFrames.put(page.url, frame);

            if (page.background != null) {
                DoubleRectangle imageBounds = new DoubleRectangle(page.bkgImageX, page.bkgImageY, page.bkgImageWidth, page.bkgImageHeight);
                frame.setBackgroundImage(page.background, imageBounds);
            }

            int widgetsCreated = 0;

            for (Iterator<URLLabeledLink> linkIt = page.links.iterator(); linkIt.hasNext();) {
                URLPositionedLink link = (URLPositionedLink) linkIt.next();

                // Skip links whose rounded width or height is zero.
                if ((Math.round(link.width) == 0.0) || (Math.round(link.height) == 0.0)) {
                    continue;
                }

                DoubleRectangle linkBounds = new DoubleRectangle(link.left, link.top, link.width, link.height);
                IWidget widget = new Widget(linkBounds, WidgetType.Link);

                widgetsCreated++;
                widget.setName("Widget " + widgetsCreated);
                widget.setTitle(StringUtil.trimWhitespace(link.getLabel()));
                frame.addWidget(widget);

                // Transitions are only built for designs that include a mouse.
                if (!deviceTypes.contains(DeviceType.Mouse)) {
                    continue;
                }

                String targetURL = link.getURL();
                Frame target = knownFrames.get(targetURL);

                if (target == null) {
                    // Target frame not known yet; resolved in the second pass
                    pendingTransitions.put(widget, targetURL);
                } else {
                    IUndoableEdit linkEdit = DesignEditorCmd.addTransition(stateMgr, new Transition(widget, target, buildLinkAction()));
                    editSequence.addEdit(linkEdit);
                }
            }

            // Deal with a frame of the same name that is already in the design.
            Frame existing = design.getFrame(frame.getName());

            if (existing != null) {
                if (pruneSameURLs) {
                    // Keep the existing frame; give the new one a distinct name.
                    makeFrameNameUnique(frame);
                } else {
                    // Replace the existing frame but keep its incident
                    // transitions by pointing them at the new frame.
                    Set<Transition> incoming = existing.getIncidentTransitions();

                    synchronized (incoming) {
                        // Walk a copy so that re-targeting the transitions
                        // does not upset the iteration.
                        ArrayList<Transition> snapshot = new ArrayList<Transition>(incoming);

                        for (Transition moved : snapshot) {
                            DesignEditorCmd.changeTransitionTarget(stateMgr, moved, frame, editSequence);
                        }

                        // Can't delete the transitive closure from here...sigh
                        DesignEditorCmd.deleteFrame(project, design, stateMgr, existing, ProjectLID.ImportWebCrawl, editSequence);
                    }
                }
            }

            situator.situateNextFrame(frame);
            DesignEditorCmd.addFrame(project, design, stateMgr, frame, editSequence);
        }

        // Second pass: each entry maps a link widget to its target URL string.
        for (Map.Entry<IWidget, String> pending : pendingTransitions.entrySet()) {
            String targetURL = pending.getValue();

            // Prefer a frame created by this import; otherwise fall back to
            // a frame of that name held by the design.
            Frame target = knownFrames.get(targetURL);
            if (target == null) {
                target = design.getFrame(targetURL);
            }

            // May just not be there; can't link if it's not there!
            if (target == null) {
                continue;
            }

            IWidget widget = pending.getKey();
            IUndoableEdit linkEdit = DesignEditorCmd.addTransition(stateMgr, new Transition(widget, target, buildLinkAction()));
            editSequence.addEdit(linkEdit);
        }

        // Close the compound edit and register it with the undo manager.
        editSequence.end();
        undoMgr.addEdit(editSequence);
    } finally {
        // Recover resources, then let the superclass finish up.
        importWeb.dispose();
        super.doneCallback();
    }
}
Aggregations