
Example 1 with URLNormalizers

Use of org.apache.nutch.net.URLNormalizers in the Apache Nutch project.

From the class URLPartitioner, method configure:

public void configure(Job job) {
    Configuration conf = job.getConfiguration();
    seed = conf.getInt("partition.url.seed", 0);
    mode = conf.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
    // check that the mode is known
    if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN) && !mode.equals(PARTITION_MODE_HOST)) {
        LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
        mode = PARTITION_MODE_HOST;
    }
    normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), URLNormalizers (org.apache.nutch.net.URLNormalizers)
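
For context, here is a minimal standalone sketch of what SCOPE_PARTITION normalization does in isolation. URLNormalizers.normalize(String, String) and NutchConfiguration.create() are real Nutch APIs; the wrapper class and the sample URL are assumptions for illustration only.

import java.net.MalformedURLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;

public class PartitionNormalizeSketch {
    public static void main(String[] args) throws MalformedURLException {
        // NutchConfiguration.create() loads nutch-default.xml/nutch-site.xml,
        // which define the normalizer plugins active in each scope.
        Configuration conf = NutchConfiguration.create();
        URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
        // normalize(urlString, scope) runs the normalizer chain configured for
        // the scope, so equivalent URLs end up hashing to the same partition.
        String normalized = normalizers.normalize("HTTP://Example.COM:80/a/../b", URLNormalizers.SCOPE_PARTITION);
        System.out.println(normalized);
    }
}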

Example 2 with URLNormalizers

Use of org.apache.nutch.net.URLNormalizers in the Apache Nutch project.

From the class ParserChecker, method run:

@Override
public int run(String[] args) throws Exception {
    String url = null;
    String usage = "Usage:\n"
        + "  ParserChecker [OPTIONS] <url>\n"
        + "    Fetch single URL and parse it\n"
        + "  ParserChecker [OPTIONS] -stdin\n"
        + "    Read URLs to be parsed from stdin\n"
        + "  ParserChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n"
        + "    Listen on <port> for URLs to be parsed\n"
        + "Options:\n"
        + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n"
        + "                  \t(a generic Hadoop option to be passed\n"
        + "                  \t before other command-specific options)\n"
        + "  -normalize      \tnormalize URLs\n"
        + "  -followRedirects\tfollow redirects when fetching URL\n"
        + "  -checkRobotsTxt\tfail if the robots.txt disallows fetching\n"
        + "  -dumpText       \talso show the plain-text extracted by parsers\n"
        + "  -forceAs <mimeType>\tforce parsing as <mimeType>\n"
        + "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
    // Print help when no args given
    if (args.length < 1) {
        System.err.println(usage);
        System.exit(-1);
    }
    // initialize plugins early to register URL stream handlers to support
    // custom protocol implementations
    PluginRepository.get(getConf());
    int numConsumed;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-normalize")) {
            normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
        } else if (args[i].equals("-followRedirects")) {
            followRedirects = true;
        } else if (args[i].equals("-checkRobotsTxt")) {
            checkRobotsTxt = true;
        } else if (args[i].equals("-forceAs")) {
            forceAsContentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else {
                k = nextOne;
            }
            metadata.put(k, v);
        } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
            i += numConsumed - 1;
        } else if (i != args.length - 1) {
            System.err.println("ERR: Not a recognized argument: " + args[i]);
            System.err.println(usage);
            System.exit(-1);
        } else {
            url = args[i];
        }
    }
    scfilters = new ScoringFilters(getConf());
    if (url != null) {
        return super.processSingle(url);
    } else {
        // Start listening
        return super.run();
    }
}
Also used: ScoringFilters (org.apache.nutch.scoring.ScoringFilters), URLNormalizers (org.apache.nutch.net.URLNormalizers)
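
Note that the -md handling above splits only on the first '=', so values may themselves contain '='; a bare key is stored with a null value. A standalone sketch of that contract (the class and method names are illustrative):

import java.util.HashMap;
import java.util.Map;

public class MetadataArgSketch {
    // Split a "-md key=value" argument on the first '=' only, as ParserChecker does.
    static void putMetadata(Map<String, String> metadata, String arg) {
        int firstEquals = arg.indexOf('=');
        if (firstEquals != -1) {
            metadata.put(arg.substring(0, firstEquals), arg.substring(firstEquals + 1));
        } else {
            metadata.put(arg, null); // bare key, no value
        }
    }

    public static void main(String[] args) {
        Map<String, String> md = new HashMap<>();
        putMetadata(md, "fetchInterval=3600");
        putMetadata(md, "q=a=b"); // the value keeps the second '='
        System.out.println(md);
    }
}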

Example 3 with URLNormalizers

Use of org.apache.nutch.net.URLNormalizers in the Apache Nutch project.

From the class FeedParser, method setConf:

/**
 * Sets the {@link Configuration} object for this {@link Parser}. This
 * {@link Parser} expects the following configuration properties to be set:
 *
 * <ul>
 * <li>URLNormalizers - properties in the configuration object to set up the
 * default url normalizers.</li>
 * <li>URLFilters - properties in the configuration object to set up the
 * default url filters.</li>
 * </ul>
 *
 * @param conf
 *          The Hadoop {@link Configuration} object to use to configure this
 *          {@link Parser}.
 */
@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    this.parserFactory = new ParserFactory(conf);
    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    this.filters = new URLFilters(conf);
    this.defaultEncoding = conf.get("parser.character.encoding.default", "windows-1252");
}
Also used: ParserFactory (org.apache.nutch.parse.ParserFactory), URLFilters (org.apache.nutch.net.URLFilters), URLNormalizers (org.apache.nutch.net.URLNormalizers)
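
When the parser later emits outlinks, the usual Nutch pattern is to normalize each candidate URL in SCOPE_OUTLINK and then pass it through the filters, dropping it if any filter rejects it. A hedged sketch of that pipeline; the wrapper class is an assumption, while normalize and filter are the real APIs:

import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;

public class OutlinkPipelineSketch {
    private final URLNormalizers normalizers;
    private final URLFilters filters;

    OutlinkPipelineSketch(URLNormalizers normalizers, URLFilters filters) {
        this.normalizers = normalizers;
        this.filters = filters;
    }

    // Returns the cleaned URL, or null if it is malformed or rejected by a filter.
    String clean(String url) {
        try {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_OUTLINK);
            return filters.filter(url); // filter() returns null to reject
        } catch (Exception e) {
            return null;
        }
    }
}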

Example 4 with URLNormalizers

Use of org.apache.nutch.net.URLNormalizers in the Apache Nutch project.

From the class URLPartitioner, method setConf:

@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    seed = conf.getInt("partition.url.seed", 0);
    mode = conf.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
    // check that the mode is known
    if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN) && !mode.equals(PARTITION_MODE_HOST)) {
        LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
        mode = PARTITION_MODE_HOST;
    }
    normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_PARTITION);
}
Also used: URLNormalizers (org.apache.nutch.net.URLNormalizers)
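
For reference, a sketch of how the three modes might map a normalized URL to a partition key. The byHost name is confirmed by the log message above; byDomain and byIP are assumptions matching the constant names, and URLUtil.getDomainName is a real Nutch utility:

import java.net.InetAddress;
import java.net.URL;

import org.apache.nutch.util.URLUtil;

public class PartitionKeySketch {
    static String keyFor(String mode, String normalizedUrl) throws Exception {
        URL u = new URL(normalizedUrl);
        switch (mode) {
            case "byIP": // assumed value of PARTITION_MODE_IP
                // DNS resolution per URL is expensive, one reason byHost is the default
                return InetAddress.getByName(u.getHost()).getHostAddress();
            case "byDomain": // assumed value of PARTITION_MODE_DOMAIN
                return URLUtil.getDomainName(u);
            default: // byHost
                return u.getHost();
        }
    }
}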

Example 5 with URLNormalizers

Use of org.apache.nutch.net.URLNormalizers in the Apache Nutch project.

From the class IndexingFiltersChecker, method run:

@Override
public int run(String[] args) throws Exception {
    String url = null;
    String usage = "Usage:\n"
        + "  IndexingFiltersChecker [OPTIONS] <url>\n"
        + "    Fetch single URL and index it\n"
        + "  IndexingFiltersChecker [OPTIONS] -stdin\n"
        + "    Read URLs to be indexed from stdin\n"
        + "  IndexingFiltersChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n"
        + "    Listen on <port> for URLs to be indexed\n"
        + "Options:\n"
        + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n"
        + "                  \t(a generic Hadoop option to be passed\n"
        + "                  \t before other command-specific options)\n"
        + "  -normalize      \tnormalize URLs\n"
        + "  -followRedirects\tfollow redirects when fetching URL\n"
        + "  -checkRobotsTxt\tfail if the robots.txt disallows fetching\n"
        + "  -dumpText       \tshow the entire plain-text content,\n"
        + "                  \tnot only the first 100 characters\n"
        + "  -doIndex        \tpass document to configured index writers\n"
        + "                  \tand let them index it\n"
        + "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
    // Print help when no args given
    if (args.length < 1) {
        System.err.println(usage);
        System.exit(-1);
    }
    // read the property "doIndex" for backward compatibility
    doIndex = getConf().getBoolean("doIndex", false);
    int numConsumed;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-normalize")) {
            normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
        } else if (args[i].equals("-followRedirects")) {
            followRedirects = true;
        } else if (args[i].equals("-checkRobotsTxt")) {
            checkRobotsTxt = true;
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-doIndex")) {
            doIndex = true;
        } else if (args[i].equals("-md")) {
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else {
                k = nextOne;
            }
            metadata.put(k, v);
        } else if ((numConsumed = super.parseArgs(args, i)) > 0) {
            i += numConsumed - 1;
        } else if (i != args.length - 1) {
            System.err.println("ERR: Not a recognized argument: " + args[i]);
            System.err.println(usage);
            System.exit(-1);
        } else {
            url = args[i];
        }
    }
    if (url != null) {
        return super.processSingle(url);
    } else {
        // Start listening
        return super.run();
    }
}
Also used: URLNormalizers (org.apache.nutch.net.URLNormalizers)
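
Both checkers are Hadoop Tool implementations, so they are typically launched via ToolRunner, which is also what makes the generic -D<property>=<value> options work before the checker's own run(String[]) sees the remaining arguments. A minimal launcher sketch (the wrapper class is illustrative):

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexingFiltersChecker;
import org.apache.nutch.util.NutchConfiguration;

public class RunCheckerSketch {
    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic options such as -Dplugin.includes=...
        // and passes the rest on to IndexingFiltersChecker.run(String[]).
        int res = ToolRunner.run(NutchConfiguration.create(),
                new IndexingFiltersChecker(), args);
        System.exit(res);
    }
}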

Aggregations

URLNormalizers (org.apache.nutch.net.URLNormalizers): 9
Configuration (org.apache.hadoop.conf.Configuration): 5
URLFilters (org.apache.nutch.net.URLFilters): 5
ScoringFilters (org.apache.nutch.scoring.ScoringFilters): 2
IOException (java.io.IOException): 1
MalformedURLException (java.net.MalformedURLException): 1
URL (java.net.URL): 1
ArrayList (java.util.ArrayList): 1
Entry (java.util.Map.Entry): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
MapFile (org.apache.hadoop.io.MapFile): 1
Option (org.apache.hadoop.io.MapFile.Writer.Option): 1
MapWritable (org.apache.hadoop.io.MapWritable): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
CompressionType (org.apache.hadoop.io.SequenceFile.CompressionType): 1
Metadata (org.apache.hadoop.io.SequenceFile.Metadata): 1
Text (org.apache.hadoop.io.Text): 1
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 1
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 1