Use of org.opensextant.xtext.collectors.web.DefaultWebCrawl in project Xponents by OpenSextant.
The WebCrawl class, method main:
import java.io.File;
import org.opensextant.util.FileUtility; // package paths for FileUtility and XText assumed from the Xponents source tree
import org.opensextant.xtext.XText;
import org.opensextant.xtext.collectors.web.DefaultWebCrawl;

public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("WebCrawl", args, "do:l:f:");
    String o = null;
    String webSite = null;
    boolean currentDirOnly = false;
    String inputFile = null;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 'l':
                    webSite = opts.getOptarg();
                    break;
                case 'f':
                    inputFile = opts.getOptarg();
                    break;
                case 'o':
                    o = opts.getOptarg();
                    break;
                case 'd':
                    currentDirOnly = true;
                    break;
                default:
                    usage();
                    System.exit(-1);
            }
        }
    } catch (Exception execErr) {
        usage();
        execErr.printStackTrace();
        System.exit(-1);
    }
    WebCrawl me = new WebCrawl();
    try {
        FileUtility.makeDirectory(o);
        XText conv = new XText();
        /*
         * The following setup for conversion and crawling does this:
         *
         * - Save converted data at cacheDir.
         * - The inputDir == cacheDir, so conversions are "saved with input".
         *
         * This is because the crawler below is downloading HTML pages to the "inputDir";
         * for simplicity's sake, the pages are converted and cached in that same
         * hierarchical folder.
         *
         * Alternatively, the crawler can save content to A and convert and cache in B,
         * where A != B and B is not inside A, e.g.:
         * /path/to/A
         * /path/to/B
         */
        File cacheDir = new File(o);
        conv.enableSaving(true);
        // The order of these settings matters: since cacheDir == inputDir and we are
        // saving with input, there is no need to set a separate conversion cache.
        // conv.getPathManager().setConversionCache(cacheDir.getAbsolutePath());
        conv.getPathManager().setInputRoot(cacheDir);
        conv.getPathManager().enableSaveWithInput(true);
        conv.setup();
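        // A minimal sketch of the separate-directory alternative described above,
        // reusing only setters seen in this snippet; the A/B paths are hypothetical:
        //
        //   File crawlDir = new File("/path/to/A");   // crawler downloads land here
        //   File convCache = new File("/path/to/B");  // conversions are cached here
        //   conv.getPathManager().setInputRoot(crawlDir);
        //   conv.getPathManager().setConversionCache(convCache.getAbsolutePath());
        //   conv.getPathManager().enableSaveWithInput(false);
        //   conv.setup();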
        DefaultWebCrawl crawl = new DefaultWebCrawl(webSite, o);
        crawl.setAllowCurrentDirOnly(currentDirOnly);
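        // If the JVM defines an HTTP proxy host, pass it along. Note the port is
        // hard-coded to 80 here; the http.proxyPort system property is not consulted.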
        String proxyHost = System.getProperty("http.proxyHost");
        if (proxyHost != null) {
            crawl.setProxy(proxyHost + ":80");
        }
        crawl.configure();
        // Wire in the converter and the crawl listener.
        crawl.setConverter(conv);
        crawl.setListener(me);
        // Go do it.
        if (inputFile != null) {
            File f = new File(inputFile);
            if (f.exists()) {
                // Parse links from the file as if it had been pulled from the
                // website given by -l.
                crawl.collect(f);
            } else {
                System.err.println("File does not exist F=" + inputFile);
            }
        } else {
            crawl.collect();
        }
    } catch (Exception err) {
        err.printStackTrace();
    }
}
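Given the getopt string "do:l:f:", an invocation looks roughly like the following sketch; the fully qualified class name and the jar name are assumptions, not taken from the snippet:

java -cp Xponents.jar org.opensextant.xtext.collectors.web.WebCrawl -l http://example.com/ -o ./crawl-cache -d

Here -l names the site to crawl, -o names the directory used both as crawl output and conversion cache, -f optionally supplies a local HTML file whose links are collected as if they came from the -l site, and -d restricts the crawl to the starting directory of the site.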