Use of org.archive.modules.CrawlURI in project Asqatasun, by Asqatasun.
Class AsqatasunTextSeedModule, method seedLine.
/**
 * Matches text that begins with a URI scheme followed by ':' (RFC 2396
 * s3.1 scheme, minus '.'): one letter, then one or more word characters,
 * '+' or '-'. Compiled once instead of on every {@code seedLine} call.
 */
private static final java.util.regex.Pattern SCHEME_PREFIX =
        java.util.regex.Pattern.compile("[a-zA-Z][\\w+\\-]+:.*");

/**
 * Handle a read line that is probably a seed.
 *
 * <p>If the line does not begin with a scheme, {@code http://} is
 * prepended. The resulting text is turned into a {@link CrawlURI} that is
 * flagged as a seed, given the {@code MEDIUM} scheduling directive and,
 * when {@code getSourceTagSeeds()} returns true, tagged with its own
 * string form as source tag, before being handed to
 * {@code publishAddedSeed}.
 *
 * <p>If the text cannot be parsed as a URI, it is passed to
 * {@code nonseedLine} instead. Note that the fallback receives the
 * possibly {@code http://}-prefixed text, not the raw line.
 *
 * @param uri String seed-containing line; must not be null
 */
protected void seedLine(String uri) {
    if (!SCHEME_PREFIX.matcher(uri).matches()) {
        // Does not begin with scheme, so try http://
        uri = "http://" + uri;
    }
    try {
        UURI uuri = UURIFactory.getInstance(uri);
        CrawlURI curi = new CrawlURI(uuri);
        curi.setSeed(true);
        curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
        if (getSourceTagSeeds()) {
            curi.setSourceTag(curi.toString());
        }
        publishAddedSeed(curi);
    } catch (URIException e) {
        // try as nonseed line as fallback
        nonseedLine(uri);
    }
}
Aggregations