Search in sources :

Example 1 with TrecExtractLinks

Use of edu.umd.cloud9.webgraph.TrecExtractLinks in project Cloud9 by lintool.

Class TrecDriver, method run.

/**
 * Drives the web-graph construction pipeline as a fixed sequence of jobs that
 * all share one configuration object.
 *
 * <p>Stages, in order: link extraction, reverse web graph, web graph, and --
 * only when {@code computeAnchorWeights} is set -- host-name collection
 * followed by weight computation. Before each stage the input/output paths
 * and mapper/reducer counts are written into the shared configuration under
 * the {@code Cloud9.*} keys.
 *
 * @param args command-line arguments, handed to {@code readInput}
 * @return {@code 0} if every executed stage returned {@code 0}; {@code -1} if
 *         the arguments were rejected or any stage returned a non-zero code
 * @throws Exception propagated from the invoked helpers and stages
 */
public int run(String[] args) throws Exception {
    conf = getConf();
    configer = new CollectionConfigurationManager();
    if (!readInput(args)) {
        printUsage();
        return -1;
    }
    configer.applyConfig(conf);

    // Locations handed from one stage to the next. Only the link-extraction
    // directory is written without a trailing slash.
    final String extractLinksDir = outputBase + "/" + DriverUtil.OUTPUT_EXTRACT_LINKS;
    final String reverseGraphDir = outputBase + "/" + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
    final String webGraphDir = outputBase + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/";

    // Settings that apply from the first stage onwards.
    conf.setInt("Cloud9.Mappers", 2000);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    conf.setBoolean("Cloud9.IncludeInternalLinks", includeInternalLinks);
    conf.set("Cloud9.AnchorTextNormalizer", normalizer);

    // Stage 1: extract link information for each segment separately.
    conf.set("Cloud9.InputPath", inputBase);
    conf.set("Cloud9.OutputPath", extractLinksDir);
    if (new TrecExtractLinks(conf, configer).run() != 0) {
        return -1;
    }

    // Stage 2: build the reverse web graph (i.e., gather incoming links).
    conf.set("Cloud9.InputPath", extractLinksDir);
    conf.set("Cloud9.OutputPath", reverseGraphDir);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    if (new BuildReverseWebGraph(conf).run() != 0) {
        return -1;
    }

    // Stage 3: build the web graph from the reverse web graph.
    conf.set("Cloud9.InputPath", reverseGraphDir);
    conf.set("Cloud9.OutputPath", webGraphDir);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    if (new BuildWebGraph(conf).run() != 0) {
        return -1;
    }

    // The remaining stages exist only to produce anchor weights.
    if (!computeAnchorWeights) {
        return 0;
    }

    // Stage 4: propagate domain names, needed to compute anchor weights.
    final String hostNamesDir = outputBase + "/" + DriverUtil.OUTPUT_HOST_NAMES + "/";
    conf.set("Cloud9.InputPath", webGraphDir);
    conf.set("Cloud9.OutputPath", hostNamesDir);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    if (new CollectHostnames(conf).run() != 0) {
        return -1;
    }

    // Stage 5: compute the weights. The input is two comma-separated
    // locations: the reverse web graph and the collected host names.
    final String weightedDir = outputBase + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/";
    conf.set("Cloud9.InputPath", reverseGraphDir + "," + hostNamesDir);
    conf.set("Cloud9.OutputPath", weightedDir);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    if (new ComputeWeight(conf).run() != 0) {
        return -1;
    }

    return 0;
}
Also used : BuildReverseWebGraph(edu.umd.cloud9.webgraph.BuildReverseWebGraph) ComputeWeight(edu.umd.cloud9.webgraph.ComputeWeight) CollectHostnames(edu.umd.cloud9.webgraph.CollectHostnames) TrecExtractLinks(edu.umd.cloud9.webgraph.TrecExtractLinks) BuildWebGraph(edu.umd.cloud9.webgraph.BuildWebGraph) CollectionConfigurationManager(edu.umd.cloud9.webgraph.CollectionConfigurationManager)

Aggregations

BuildReverseWebGraph (edu.umd.cloud9.webgraph.BuildReverseWebGraph)1 BuildWebGraph (edu.umd.cloud9.webgraph.BuildWebGraph)1 CollectHostnames (edu.umd.cloud9.webgraph.CollectHostnames)1 CollectionConfigurationManager (edu.umd.cloud9.webgraph.CollectionConfigurationManager)1 ComputeWeight (edu.umd.cloud9.webgraph.ComputeWeight)1 TrecExtractLinks (edu.umd.cloud9.webgraph.TrecExtractLinks)1