Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class Ftp, method main.
/**
* For debugging.
*/
public static void main(String[] args) throws Exception {
int timeout = Integer.MIN_VALUE;
int maxContentLength = Integer.MIN_VALUE;
String logLevel = "info";
boolean followTalk = false;
boolean keepConnection = false;
boolean dumpContent = false;
String urlString = null;
String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
}
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-logLevel")) {
logLevel = args[++i];
} else if (args[i].equals("-followTalk")) {
followTalk = true;
} else if (args[i].equals("-keepConnection")) {
keepConnection = true;
} else if (args[i].equals("-timeout")) {
timeout = Integer.parseInt(args[++i]) * 1000;
} else if (args[i].equals("-maxContentLength")) {
maxContentLength = Integer.parseInt(args[++i]);
} else if (args[i].equals("-dumpContent")) {
dumpContent = true;
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
} else {
urlString = args[i];
}
}
Ftp ftp = new Ftp();
ftp.setFollowTalk(followTalk);
ftp.setKeepConnection(keepConnection);
// set timeout
if (timeout != Integer.MIN_VALUE)
ftp.setTimeout(timeout);
// set maxContentLength
if (maxContentLength != Integer.MIN_VALUE)
ftp.setMaxContentLength(maxContentLength);
// set log level
// LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
if (dumpContent) {
System.out.print(new String(content.getContent()));
}
ftp = null;
}
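The main method above is only a command-line harness; a minimal hedged sketch of driving the same protocol plugin from code follows. The package import, the setConf call, and the sample URL are assumptions for illustration, not taken from the snippet above.

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ftp.Ftp; // assumed location of the protocol-ftp plugin class
import org.apache.nutch.util.NutchConfiguration;

public class FtpFetchSketch {
  public static void main(String[] args) throws Exception {
    Ftp ftp = new Ftp();
    // Assumption: like other Nutch Protocol implementations, Ftp is Configurable.
    ftp.setConf(NutchConfiguration.create());
    ftp.setTimeout(30 * 1000); // mirrors "-timeout 30"
    Content content = ftp
        .getProtocolOutput(new Text("ftp://example.org/pub/"), new CrawlDatum())
        .getContent();
    System.out.println("Content-Type: " + content.getContentType());
  }
}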
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class FtpRobotRulesParser, method getRobotRulesSet.
/**
* Get the robots rules for the host of the given URL. If the rules for that
* host are not yet cached, this method sends an FTP request to the host
* corresponding to the {@link URL} passed, gets the robots file, parses the
* rules, and caches the rules object to avoid re-work in the future.
*
* @param ftp
* The {@link Protocol} object
* @param url
* URL to check
* @param robotsTxtContent
* container to store responses when fetching the robots.txt file for
* debugging or archival purposes. Instead of a robots.txt file, it
* may include redirects or an error page (404, etc.). Response
* {@link Content} is appended to the passed list. If null is passed
* nothing is stored.
*
* @return robotRules A {@link BaseRobotRules} object for the rules
*/
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
// normalize protocol and host to lower case
String protocol = url.getProtocol().toLowerCase();
String host = url.getHost().toLowerCase();
if (LOG.isTraceEnabled() && isWhiteListed(url)) {
LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
}
BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
if (robotRules != null) {
// cached rule
return robotRules;
} else if (LOG.isTraceEnabled()) {
LOG.trace("cache miss " + url);
}
boolean cacheRule = true;
if (isWhiteListed(url)) {
// check in advance whether a host is whitelisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Whitelisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
} else {
try {
Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
ProtocolStatus status = output.getStatus();
if (robotsTxtContent != null) {
robotsTxtContent.add(output.getContent());
}
if (status.getCode() == ProtocolStatus.SUCCESS) {
robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
} else {
// use default rules
robotRules = EMPTY_RULES;
}
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
// try again later to fetch robots.txt
cacheRule = false;
robotRules = EMPTY_RULES;
}
}
// cache rules for host
if (cacheRule)
CACHE.put(protocol + ":" + host, robotRules);
return robotRules;
}
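The cache key above is the lower-cased protocol plus host, so all paths on one FTP server share a single rules object. A small self-contained illustration of that key construction (the host name is made up):

import java.net.MalformedURLException;
import java.net.URL;

public class RobotsCacheKeySketch {
  public static void main(String[] args) throws MalformedURLException {
    URL a = new URL("ftp://FTP.example.org/pub/file1.txt");
    URL b = new URL("ftp://ftp.example.org/other/file2.txt");
    // Same key construction as in getRobotRulesSet above.
    String keyA = a.getProtocol().toLowerCase() + ":" + a.getHost().toLowerCase();
    String keyB = b.getProtocol().toLowerCase() + ":" + b.getHost().toLowerCase();
    // Both URLs map to "ftp:ftp.example.org", so the second lookup is a cache hit.
    System.out.println(keyA + " equals " + keyB + ": " + keyA.equals(keyB));
  }
}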
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestCrawlDbStates, method testCrawlDbStateTransitionMatrix.
/**
* Test the matrix of state transitions:
* <ul>
* <li>for all available {@link FetchSchedule} implementations</li>
* <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
* <li>for every possible fetch status</li>
* <li>and zero or more (0-3) additional in-links</li>
* </ul>
* call {@literal updatedb} and check whether the resulting CrawlDb status is
* the expected one.
*/
@Test
public void testCrawlDbStateTransitionMatrix() {
LOG.info("Test CrawlDatum state transitions");
Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
Configuration conf = context.getConfiguration();
CrawlDbUpdateUtil updateDb = null;
try {
updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
} catch (IOException e) {
e.printStackTrace();
}
int retryMax = conf.getInt("db.fetch.retry.max", 3);
for (String sched : schedules) {
LOG.info("Testing state transitions with " + sched);
conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
for (int i = 0; i < fetchDbStatusPairs.length; i++) {
byte fromDbStatus = fetchDbStatusPairs[i][1];
for (int j = 0; j < fetchDbStatusPairs.length; j++) {
byte fetchStatus = fetchDbStatusPairs[j][0];
CrawlDatum fromDb = null;
if (fromDbStatus == -1) {
// nothing yet in CrawlDb
// CrawlDatum added by FreeGenerator or via outlink
} else {
fromDb = new CrawlDatum();
fromDb.setStatus(fromDbStatus);
// initialize fetchInterval:
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
}
// expected db status
byte toDbStatus = fetchDbStatusPairs[j][1];
if (fetchStatus == -1) {
if (fromDbStatus == -1) {
// nothing fetched yet: new document detected via outlink
toDbStatus = STATUS_DB_UNFETCHED;
} else {
// nothing fetched but new inlinks detected: status is unchanged
toDbStatus = fromDbStatus;
}
} else if (fetchStatus == STATUS_FETCH_RETRY) {
// a simple test of fetch_retry (without retries)
if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
toDbStatus = STATUS_DB_UNFETCHED;
} else {
toDbStatus = STATUS_DB_GONE;
}
}
String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>" : getStatusName(fromDbStatus));
String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>" : CrawlDatum.getStatusName(fetchStatus));
LOG.info(fromDbStatusName + " + " + fetchStatusName + " => " + getStatusName(toDbStatus));
List<CrawlDatum> values = new ArrayList<CrawlDatum>();
for (int l = 0; l <= 2; l++) {
// number of additional in-links
CrawlDatum fetch = null;
if (fetchStatus == -1) {
// nothing fetched, need at least one in-link
if (l == 0)
continue;
} else {
fetch = new CrawlDatum();
if (fromDb != null) {
fetch.set(fromDb);
} else {
// not yet in CrawlDb: added by FreeGenerator
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
}
fetch.setStatus(fetchStatus);
fetch.setFetchTime(System.currentTimeMillis());
}
if (fromDb != null)
values.add(fromDb);
if (fetch != null)
values.add(fetch);
for (int n = 0; n < l; n++) {
values.add(linked);
}
List<CrawlDatum> res = updateDb.update(values);
if (res.size() != 1) {
fail("CrawlDb update didn't result in one single CrawlDatum per URL");
continue;
}
byte status = res.get(0).getStatus();
if (status != toDbStatus) {
fail("CrawlDb update for " + fromDbStatusName + " and " + fetchStatusName + " and " + l + " inlinks results in " + getStatusName(status) + " (expected: " + getStatusName(toDbStatus) + ")");
}
values.clear();
}
}
}
}
}
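Written out by hand, one cell of the matrix looks like the hedged sketch below; the expected outcome (db_unfetched plus fetch_success yields db_fetched) is the conventional transition and is stated here as an assumption, not read off the test output.

import java.util.Arrays;
import java.util.List;
import org.apache.nutch.crawl.CrawlDatum;

public class SingleTransitionSketch {
  public static void main(String[] args) {
    // CrawlDb entry before the fetch.
    CrawlDatum fromDb = new CrawlDatum();
    fromDb.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
    // Datum produced by a successful fetch.
    CrawlDatum fetch = new CrawlDatum();
    fetch.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
    fetch.setFetchTime(System.currentTimeMillis());
    // Reducer input for this cell of the matrix (no additional in-links).
    List<CrawlDatum> values = Arrays.asList(fromDb, fetch);
    // Fed through updateDb.update(values) as in the test above, the single
    // resulting datum is expected to carry CrawlDatum.STATUS_DB_FETCHED.
    System.out.println(values);
  }
}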
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestIndexingFilters, method testNonExistingIndexingFilter.
/**
* Test behaviour when a configured filter class does not exist.
*
* @throws IndexingException
*/
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
Configuration conf = NutchConfiguration.create();
conf.addResource("nutch-default.xml");
conf.addResource("crawl-tests.xml");
String class1 = "NonExistingFilter";
String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
IndexingFilters filters = new IndexingFilters(conf);
filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
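The test hinges on the indexing-filter order property being a space-separated list of class names; a short hedged sketch of setting it directly follows (only the real BasicIndexingFilter is listed).

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.util.NutchConfiguration;

public class FilterOrderSketch {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Space-separated list of filter class names, applied in this order.
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
        "org.apache.nutch.indexer.basic.BasicIndexingFilter");
    IndexingFilters filters = new IndexingFilters(conf);
    // As the test above shows, an unknown class name in that list does not
    // make construction or filtering throw.
    System.out.println(filters);
  }
}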
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestSegmentMergerCrawlDatums, method createSegment.
protected void createSegment(Path segment, byte status, boolean fetch, boolean redirect) throws Exception {
LOG.info("\nSegment: " + segment.toString());
// The URL of our main record
String url = "http://nutch.apache.org/";
// The URL of our redirecting URL
String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
// Our value
CrawlDatum value = new CrawlDatum();
// Path of the segment's crawl_fetch directory
Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
// Get a writer for map files containing <Text,CrawlDatum> pairs
Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt);
// first write the main URL as a linked datum, to check how it merges with the fetch datum
if (redirect) {
// We're writing our main record URL with status linked
LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
value = new CrawlDatum();
value.setStatus(CrawlDatum.STATUS_LINKED);
writer.append(new Text(url), value);
}
// Whether we're fetching now
if (fetch) {
LOG.info(url + " > " + CrawlDatum.getStatusName(status));
// Set the status
value.setStatus(status);
// Write the pair
writer.append(new Text(url), value);
}
// Whether we're handling a redirect now
if (redirect) {
// And the redirect URL with redirect status, pointing to our main URL
LOG.info(redirectUrl + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
writer.append(new Text(redirectUrl), value);
}
// Close the writer
writer.close();
}
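For checking what the merger will later see, a hedged helper that reads the crawl_fetch data back could look like this (hypothetical method, reusing the test's conf and LOG fields):

protected void dumpCrawlFetch(Path segment) throws Exception {
  // Hypothetical helper, not part of the original test class.
  Path crawlFetchPath = new Path(new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
  MapFile.Reader reader = new MapFile.Reader(crawlFetchPath, conf);
  Text key = new Text();
  CrawlDatum value = new CrawlDatum();
  while (reader.next(key, value)) {
    LOG.info(key + " > " + CrawlDatum.getStatusName(value.getStatus()));
  }
  reader.close();
}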