Use of org.apache.nutch.net.URLNormalizers in project Nutch by Apache.
The class URLPartitioner, method configure:
public void configure(Job job) {
  Configuration conf = job.getConfiguration();
  seed = conf.getInt("partition.url.seed", 0);
  mode = conf.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
  // check that the mode is known
  if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN)
      && !mode.equals(PARTITION_MODE_HOST)) {
    LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
    mode = PARTITION_MODE_HOST;
  }
  normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
}
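For context, a minimal sketch of how normalizers created with SCOPE_PARTITION are typically applied before a URL is hashed to a partition. The getPartition body below is illustrative, not the Nutch implementation: byHost mode is assumed, and the host-to-IP resolution done in byIP mode is omitted.

// Illustrative only: normalize in the partition scope, then hash the
// host (byHost mode) mixed with the configured seed.
// (assumes imports of java.net.URL, java.net.MalformedURLException,
// org.apache.hadoop.io.Text and org.apache.hadoop.io.Writable)
public int getPartition(Text key, Writable value, int numReduceTasks) {
  String urlString = key.toString();
  try {
    urlString = normalizers.normalize(urlString,
        URLNormalizers.SCOPE_PARTITION);
    URL url = new URL(urlString);
    int hashCode = url.getHost().hashCode() ^ seed;
    return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
  } catch (MalformedURLException e) {
    return 0; // unparsable URLs all land in partition 0
  }
}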
Use of org.apache.nutch.net.URLNormalizers in project Nutch by Apache.
The class ParserChecker, method run:
@Override
public int run(String[] args) throws Exception {
  String url = null;
  String usage = //
      "Usage:\n" + //
      "  ParserChecker [OPTIONS] <url>\n" + //
      "    Fetch single URL and parse it\n" + //
      "  ParserChecker [OPTIONS] -stdin\n" + //
      "    Read URLs to be parsed from stdin\n" + //
      "  ParserChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n" + //
      "    Listen on <port> for URLs to be parsed\n" + //
      "Options:\n" + //
      "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" + //
      "                  \t(a generic Hadoop option to be passed\n" + //
      "                  \t before other command-specific options)\n" + //
      "  -normalize      \tnormalize URLs\n" + //
      "  -followRedirects\tfollow redirects when fetching URL\n" + //
      "  -checkRobotsTxt \tfail if the robots.txt disallows fetching\n" + //
      "  -dumpText       \talso show the plain-text extracted by parsers\n" + //
      "  -forceAs <mimeType>\tforce parsing as <mimeType>\n" + //
      "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
  // Print help when no args given
  if (args.length < 1) {
    System.err.println(usage);
    System.exit(-1);
  }
  // initialize plugins early to register URL stream handlers to support
  // custom protocol implementations
  PluginRepository.get(getConf());
  int numConsumed;
  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-normalize")) {
      normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
    } else if (args[i].equals("-followRedirects")) {
      followRedirects = true;
    } else if (args[i].equals("-checkRobotsTxt")) {
      checkRobotsTxt = true;
    } else if (args[i].equals("-forceAs")) {
      forceAsContentType = args[++i];
    } else if (args[i].equals("-dumpText")) {
      dumpText = true;
    } else if (args[i].equals("-md")) {
      // split "-md key=value" on the first '=' only, so values may
      // themselves contain '='
      String k = null, v = null;
      String nextOne = args[++i];
      int firstEquals = nextOne.indexOf("=");
      if (firstEquals != -1) {
        k = nextOne.substring(0, firstEquals);
        v = nextOne.substring(firstEquals + 1);
      } else {
        k = nextOne;
      }
      metadata.put(k, v);
    } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
      i += numConsumed - 1;
    } else if (i != args.length - 1) {
      System.err.println("ERR: Not a recognized argument: " + args[i]);
      System.err.println(usage);
      System.exit(-1);
    } else {
      url = args[i];
    }
  }
  scfilters = new ScoringFilters(getConf());
  if (url != null) {
    return super.processSingle(url);
  } else {
    // Start listening
    return super.run();
  }
}
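The -normalize flag above only instantiates the normalizers; the actual normalization happens later, when each input URL is processed. A minimal sketch of that step, assuming the normalizers field is non-null and using the same SCOPE_DEFAULT as the flag handler (the URL is illustrative):

// Minimal sketch (not the Nutch source): apply the -normalize
// normalizers to an input URL before it is fetched and parsed.
String checkUrl = "http://Example.COM/a/../index.html"; // illustrative
if (normalizers != null) {
  try {
    checkUrl = normalizers.normalize(checkUrl, URLNormalizers.SCOPE_DEFAULT);
  } catch (MalformedURLException e) {
    checkUrl = null; // reject URLs that cannot be normalized
  }
}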
Use of org.apache.nutch.net.URLNormalizers in project Nutch by Apache.
The class FeedParser, method setConf:
/**
 * Sets the {@link Configuration} object for this {@link Parser}. This
 * {@link Parser} expects the following configuration properties to be set:
 *
 * <ul>
 * <li>URLNormalizers - properties in the configuration object to set up the
 * default url normalizers.</li>
 * <li>URLFilters - properties in the configuration object to set up the
 * default url filters.</li>
 * </ul>
 *
 * @param conf
 *          The Hadoop {@link Configuration} object to use to configure this
 *          {@link Parser}.
 */
@Override
public void setConf(Configuration conf) {
  this.conf = conf;
  this.parserFactory = new ParserFactory(conf);
  this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
  this.filters = new URLFilters(conf);
  this.defaultEncoding = conf.get("parser.character.encoding.default",
      "windows-1252");
}
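A hedged sketch of how outlink-scoped normalizers and filters like these are commonly chained when a feed entry's link is turned into an Outlink. The names entryLink, entryTitle, and outlinks are illustrative, not fields of FeedParser:

// Illustrative chain: normalize in SCOPE_OUTLINK, then filter; a null
// result from URLFilters.filter means the URL was rejected.
String toUrl = entryLink;
try {
  toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
  toUrl = filters.filter(toUrl);
} catch (Exception e) {
  toUrl = null;
}
if (toUrl != null) {
  outlinks.add(new Outlink(toUrl, entryTitle));
}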
Use of org.apache.nutch.net.URLNormalizers in project Nutch by Apache.
The class URLPartitioner, method setConf:
@Override
public void setConf(Configuration conf) {
  this.conf = conf;
  seed = conf.getInt("partition.url.seed", 0);
  mode = conf.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
  // check that the mode is known
  if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN)
      && !mode.equals(PARTITION_MODE_HOST)) {
    LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
    mode = PARTITION_MODE_HOST;
  }
  normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
}
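To exercise this setConf path, a small illustrative driver. It assumes PARTITION_MODE_KEY resolves to the "partition.url.mode" property and that "byDomain" is one of the accepted modes, as the constants in the snippet suggest; "partition.url.seed" is taken verbatim from the code above:

// Illustrative setup: pick the byDomain mode and a fixed seed, then
// hand the Configuration to the partitioner.
Configuration conf = NutchConfiguration.create();
conf.set("partition.url.mode", "byDomain"); // assumed value of PARTITION_MODE_KEY
conf.setInt("partition.url.seed", 1234);
URLPartitioner partitioner = new URLPartitioner();
partitioner.setConf(conf);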
Use of org.apache.nutch.net.URLNormalizers in project Nutch by Apache.
The class IndexingFiltersChecker, method run:
@Override
public int run(String[] args) throws Exception {
  String url = null;
  String usage = //
      "Usage:\n" + //
      "  IndexingFiltersChecker [OPTIONS] <url>\n" + //
      "    Fetch single URL and index it\n" + //
      "  IndexingFiltersChecker [OPTIONS] -stdin\n" + //
      "    Read URLs to be indexed from stdin\n" + //
      "  IndexingFiltersChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n" + //
      "    Listen on <port> for URLs to be indexed\n" + //
      "Options:\n" + //
      "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" + //
      "                  \t(a generic Hadoop option to be passed\n" + //
      "                  \t before other command-specific options)\n" + //
      "  -normalize      \tnormalize URLs\n" + //
      "  -followRedirects\tfollow redirects when fetching URL\n" + //
      "  -checkRobotsTxt \tfail if the robots.txt disallows fetching\n" + //
      "  -dumpText       \tshow the entire plain-text content,\n" + //
      "                  \tnot only the first 100 characters\n" + //
      "  -doIndex        \tpass document to configured index writers\n" + //
      "                  \tand let them index it\n" + //
      "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
  // Print help when no args given
  if (args.length < 1) {
    System.err.println(usage);
    System.exit(-1);
  }
  // read property "doIndex" for backward compatibility
  doIndex = getConf().getBoolean("doIndex", false);
  int numConsumed;
  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-normalize")) {
      normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
    } else if (args[i].equals("-followRedirects")) {
      followRedirects = true;
    } else if (args[i].equals("-checkRobotsTxt")) {
      checkRobotsTxt = true;
    } else if (args[i].equals("-dumpText")) {
      dumpText = true;
    } else if (args[i].equals("-doIndex")) {
      doIndex = true;
    } else if (args[i].equals("-md")) {
      // split "-md key=value" on the first '=' only
      String k = null, v = null;
      String nextOne = args[++i];
      int firstEquals = nextOne.indexOf("=");
      if (firstEquals != -1) {
        k = nextOne.substring(0, firstEquals);
        v = nextOne.substring(firstEquals + 1);
      } else {
        k = nextOne;
      }
      metadata.put(k, v);
    } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
      i += numConsumed - 1;
    } else if (i != args.length - 1) {
      System.err.println("ERR: Not a recognized argument: " + args[i]);
      System.err.println(usage);
      System.exit(-1);
    } else {
      url = args[i];
    }
  }
  if (url != null) {
    return super.processSingle(url);
  } else {
    // Start listening
    return super.run();
  }
}
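Since the checker is driven through super.run() and parseArgs, it can plausibly be launched via Hadoop's ToolRunner. A hedged sketch, assuming IndexingFiltersChecker implements the Tool interface (which the getConf()/parseArgs usage above suggests); the flags and URL are illustrative:

// Hypothetical programmatic launch, equivalent to the command line.
public static void main(String[] args) throws Exception {
  int res = ToolRunner.run(NutchConfiguration.create(),
      new IndexingFiltersChecker(),
      new String[] { "-normalize", "-dumpText", "http://example.com/" });
  System.exit(res);
}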