Use of org.archive.net.UURI in project Asqatasun by Asqatasun.
Class AsqatasunWriterProcessor, method innerProcess.
/**
 * Hands the recorded response of a crawled URI over to {@code contentWriter}.
 *
 * <p>Processing is skipped when the URI scheme is neither http nor https, or
 * when the recorded response content length is zero. A fetch status other
 * than {@code HTTP_SUCCESS_RETURN_CODE} is persisted as an unsuccessful
 * resource; otherwise the recorded input is persisted as a successful one.
 * An {@link IOException} raised while persisting is recorded in the URI's
 * non-fatal failures rather than propagated.
 *
 * @param curi the crawl URI being processed
 */
@Override
protected void innerProcess(CrawlURI curi) {
    Logger.getLogger(this.getClass()).debug("inner process? " + curi.getURI());
    // Current URI.
    UURI uuri = curi.getUURI();
    // Only http and https schemes are supported.
    String scheme = uuri.getScheme();
    if (!HTTP_PREFIX.equalsIgnoreCase(scheme) && !HTTPS_PREFIX.equalsIgnoreCase(scheme)) {
        return;
    }
    RecordingInputStream recis = curi.getRecorder().getRecordedInput();
    // Nothing was recorded for this response: nothing to persist.
    // NOTE(review): the stream is left open on this path and on the
    // unsuccessful-fetch path below, as before — confirm who owns it there.
    if (0L == recis.getResponseContentLength()) {
        return;
    }
    if (curi.getFetchStatus() != HTTP_SUCCESS_RETURN_CODE) {
        contentWriter.computeAndPersistUnsuccessfullFetchedResource(curi);
        return;
    }
    try {
        contentWriter.computeAndPersistSuccessfullFetchedResource(curi, recis);
    } catch (IOException e) {
        curi.getNonFatalFailures().add(e);
    } finally {
        // Close in finally so the stream is released even when persisting
        // fails with an unchecked exception.
        IOUtils.closeQuietly(recis);
    }
}
Use of org.archive.net.UURI in project Asqatasun by Asqatasun.
Class AsqatasunTextSeedModule, method seedLine.
/**
 * Processes one input line that is expected to hold a seed URI.
 *
 * <p>A line that does not start with a scheme is prefixed with
 * {@code http://}. The resulting string is turned into a seed
 * {@code CrawlURI} with medium scheduling priority and published; if it
 * cannot be parsed as a URI, it is passed to {@code nonseedLine} instead.
 *
 * @param uri String seed-containing line
 */
protected void seedLine(String uri) {
    // Scheme syntax per RFC 2396 s3.1, minus '.'.
    final boolean startsWithScheme = uri.matches("[a-zA-Z][\\w+\\-]+:.*");
    // No leading scheme: assume http://.
    final String candidate = startsWithScheme ? uri : "http://" + uri;
    try {
        final CrawlURI seed = new CrawlURI(UURIFactory.getInstance(candidate));
        seed.setSeed(true);
        seed.setSchedulingDirective(SchedulingConstants.MEDIUM);
        final boolean tagWithSource = getSourceTagSeeds();
        if (tagWithSource) {
            seed.setSourceTag(seed.toString());
        }
        publishAddedSeed(seed);
    } catch (URIException invalidSeed) {
        // Not a usable URI: fall back to treating the line as a non-seed line.
        nonseedLine(candidate);
    }
}
Aggregations