Search in sources :

Example 1 with ClueExtractLinks

Use of edu.umd.cloud9.webgraph.ClueExtractLinks in the project Cloud9 by lintool.

From the class ClueWebDriver, the method run.

/**
 * Runs the web-graph construction pipeline over a range of collection segments.
 *
 * <p>The steps, in order, are:
 * <ol>
 *   <li>{@code ClueExtractLinks}, once per segment in {@code [fromSegment, toSegment]};</li>
 *   <li>{@code BuildReverseWebGraph} over all per-segment extraction outputs;</li>
 *   <li>{@code BuildWebGraph} over the reverse web graph;</li>
 *   <li>only when the compute-weights flag is present: {@code CollectHostnames}
 *       followed by {@code ComputeWeight}.</li>
 * </ol>
 * Each job is configured through the shared {@link Configuration} via the
 * {@code Cloud9.*} keys set below.
 *
 * @param args command-line arguments; at least six are required, and they are
 *             read through {@code DriverUtil.argValue} / {@code DriverUtil.argExists}
 * @return {@code 0} if every job returned {@code 0}; {@code -1} if too few
 *         arguments were given, the segment range is inverted, or any job
 *         returned a non-zero status
 * @throws Exception if argument parsing (e.g. a non-numeric segment) or any of
 *                   the jobs throws
 */
public int run(String[] args) throws Exception {
    if (args.length < 6) {
        printUsage();
        return -1;
    }
    Configuration conf = getConf();
    String inputArg = DriverUtil.argValue(args, DriverUtil.CL_INPUT);
    final String inputBase = inputArg.endsWith("/") ? inputArg : inputArg + "/";
    String outputArg = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT);
    final String outputBase = outputArg.endsWith("/") ? outputArg : outputArg + "/";
    final String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING);
    final int fromSegment = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_BEGIN_SEGMENT));
    final int toSegment = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_END_SEGMENT));
    final boolean includeInternalLinks = DriverUtil.argExists(args, DriverUtil.CL_INCLUDE_INTERNAL_LINKS);
    final boolean computeAnchorWeights = DriverUtil.argExists(args, DriverUtil.CL_COMPUTE_WEIGHTS);
    final String normalizer = DriverUtil.argValue(args, DriverUtil.CL_NORMALIZER);

    // An inverted range would skip extraction entirely yet still launch the
    // graph jobs on a directory that was never produced, with a non-positive
    // reducer count. Fail fast instead.
    if (fromSegment > toSegment) {
        System.err.println("Begin segment (" + fromSegment + ") must not exceed end segment ("
                + toSegment + ")");
        printUsage();
        return -1;
    }

    // Reducer count shared by every job that runs over the whole segment range.
    final int graphReducers = DriverUtil.DEFAULT_REDUCERS * (toSegment - fromSegment + 1);

    conf.setInt("Cloud9.Mappers", 2000);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    conf.set("Cloud9.DocnoMappingFile", docnoMapping);
    conf.setBoolean("Cloud9.IncludeInternalLinks", includeInternalLinks);
    conf.set("Cloud9.AnchorTextNormalizer", normalizer);

    // Extract link information for each segment separately. The output
    // directories are collected as we go so that the reverse-web-graph step
    // reads exactly the paths written here.
    final StringBuilder extractedPaths = new StringBuilder();
    for (int i = fromSegment; i <= toSegment; i++) {
        // Segment directories are named with a two-digit number: en.01 ... en.10.
        final String segment = "en." + (i < 10 ? "0" : "") + i;
        final String segmentInput = inputBase + segment;
        final String segmentOutput = outputBase + DriverUtil.OUTPUT_EXTRACT_LINKS + "/" + segment;
        conf.set("Cloud9.InputPath", segmentInput);
        conf.set("Cloud9.OutputPath", segmentOutput);
        if (new ClueExtractLinks(conf).run() != 0) {
            return -1;
        }
        if (extractedPaths.length() > 0) {
            extractedPaths.append(',');
        }
        extractedPaths.append(segmentOutput).append('/');
    }

    // Construct the reverse web graph (i.e., collect incoming link
    // information) from the comma-separated list of extraction outputs.
    String inputPath = extractedPaths.toString();
    String outputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
    conf.set("Cloud9.InputPath", inputPath);
    conf.set("Cloud9.OutputPath", outputPath);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", graphReducers);
    int r = new BuildReverseWebGraph(conf).run();
    if (r != 0) {
        return -1;
    }

    // Construct the web graph from the reverse web graph.
    inputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
    outputPath = outputBase + DriverUtil.OUTPUT_WEBGRAPH + "/";
    conf.set("Cloud9.InputPath", inputPath);
    conf.set("Cloud9.OutputPath", outputPath);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", graphReducers);
    r = new BuildWebGraph(conf).run();
    if (r != 0) {
        return -1;
    }

    if (computeAnchorWeights) {
        // Propagating domain names in order to compute anchor weights
        inputPath = outputBase + DriverUtil.OUTPUT_WEBGRAPH + "/";
        outputPath = outputBase + DriverUtil.OUTPUT_HOST_NAMES + "/";
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        conf.setInt("Cloud9.Mappers", 1);
        conf.setInt("Cloud9.Reducers", graphReducers);
        r = new CollectHostnames(conf).run();
        if (r != 0) {
            return -1;
        }

        // Compute the weights from the reverse web graph plus the host names.
        inputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/," + outputBase + DriverUtil.OUTPUT_HOST_NAMES + "/";
        outputPath = outputBase + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/";
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        conf.setInt("Cloud9.Mappers", 1);
        conf.setInt("Cloud9.Reducers", graphReducers);
        r = new ComputeWeight(conf).run();
        if (r != 0) {
            return -1;
        }
    }
    return 0;
}
Also used : ClueExtractLinks(edu.umd.cloud9.webgraph.ClueExtractLinks) BuildReverseWebGraph(edu.umd.cloud9.webgraph.BuildReverseWebGraph) ComputeWeight(edu.umd.cloud9.webgraph.ComputeWeight) Configuration(org.apache.hadoop.conf.Configuration) CollectHostnames(edu.umd.cloud9.webgraph.CollectHostnames) BuildWebGraph(edu.umd.cloud9.webgraph.BuildWebGraph)

Aggregations

BuildReverseWebGraph (edu.umd.cloud9.webgraph.BuildReverseWebGraph)1 BuildWebGraph (edu.umd.cloud9.webgraph.BuildWebGraph)1 ClueExtractLinks (edu.umd.cloud9.webgraph.ClueExtractLinks)1 CollectHostnames (edu.umd.cloud9.webgraph.CollectHostnames)1 ComputeWeight (edu.umd.cloud9.webgraph.ComputeWeight)1 Configuration (org.apache.hadoop.conf.Configuration)1