
Example 1 with PairOfStrings

Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.

From the class BfsIT, method testBfs.

@Test
public void testBfs() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(collectionPath));
    String[] args;
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder.class.getCanonicalName(), "-input", collectionPath.toString(), "-output_file", tmpPrefix + "-enwiki-20121201-docno.dat", "-wiki_language", "en", "-keep_all" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.wikipedia.RepackWikipedia.class.getCanonicalName(), "-input", collectionPath.toString(), "-output", tmpPrefix + "-enwiki-20121201.block", "-mapping_file", tmpPrefix + "-enwiki-20121201-docno.dat", "-wiki_language", "en", "-compression_type", "block" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.class.getCanonicalName(), "-input", tmpPrefix + "-enwiki-20121201.block", "-edges_output", tmpPrefix + "-enwiki-20121201.edges", "-adjacency_list_output", tmpPrefix + "-enwiki-20121201.adj", "-num_partitions", "10" };
    PairOfStrings out = IntegrationUtils.exec(Joiner.on(" ").join(args));
    String errorOut = out.getRightElement();
    assertTrue(errorOut.contains("EDGES=121762273"));
    assertTrue(errorOut.contains("TOTAL_VERTICES=12961996"));
    assertTrue(errorOut.contains("VERTICES_WITH_OUTLINKS=10813673"));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.bfs.EncodeBfsGraph.class.getCanonicalName(), "-input", tmpPrefix + "-enwiki-20121201.adj", "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0000", "-src", "12" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    // First iteration of BFS.
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.bfs.IterateBfs.class.getCanonicalName(), "-input", tmpPrefix + "-enwiki-20121201.bfs/iter0000", "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0001", "-num_partitions", "10" };
    out = IntegrationUtils.exec(Joiner.on(" ").join(args));
    errorOut = out.getRightElement();
    assertTrue(errorOut.contains("ReachableInMapper=1"));
    assertTrue(errorOut.contains("ReachableInReducer=573"));
    // Second iteration of BFS.
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.bfs.IterateBfs.class.getCanonicalName(), "-input", tmpPrefix + "-enwiki-20121201.bfs/iter0001", "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0002", "-num_partitions", "10" };
    out = IntegrationUtils.exec(Joiner.on(" ").join(args));
    errorOut = out.getRightElement();
    assertTrue(errorOut.contains("ReachableInMapper=573"));
    assertTrue(errorOut.contains("ReachableInReducer=37733"));
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) PairOfStrings(tl.lin.data.pair.PairOfStrings) Test(org.junit.Test)
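
A pattern repeated across these integration tests: IntegrationUtils.exec runs a Hadoop command and returns a PairOfStrings, whose right element appears to hold the captured stderr, which is where the job counters are printed. A minimal, hypothetical helper (not part of Cloud9) could bundle those counter checks, assuming only the exec return value and the JUnit assertTrue used above:

import static org.junit.Assert.assertTrue;

import tl.lin.data.pair.PairOfStrings;

public class CounterAssertions {
    // Hypothetical helper: verify that every expected counter string appears in
    // the stderr side (right element) of the (stdout, stderr) pair returned by
    // IntegrationUtils.exec.
    public static void assertCounters(PairOfStrings out, String... expectedCounters) {
        String errorOut = out.getRightElement();
        for (String counter : expectedCounters) {
            assertTrue("missing counter: " + counter, errorOut.contains(counter));
        }
    }
}

With such a helper, the three assertions after the link-graph step would collapse to a single call like assertCounters(out, "EDGES=121762273", "TOTAL_VERTICES=12961996", "VERTICES_WITH_OUTLINKS=10813673").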

Example 2 with PairOfStrings

Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.

From the class AnalyzeBigramRelativeFrequency, method main.

@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(AnalyzeBigramRelativeFrequency.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    System.out.println("input path: " + inputPath);
    List<PairOfWritables<PairOfStrings, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(inputPath));
    List<PairOfWritables<PairOfStrings, FloatWritable>> list1 = Lists.newArrayList();
    List<PairOfWritables<PairOfStrings, FloatWritable>> list2 = Lists.newArrayList();
    for (PairOfWritables<PairOfStrings, FloatWritable> p : pairs) {
        PairOfStrings bigram = p.getLeftElement();
        if (bigram.getLeftElement().equals("light")) {
            list1.add(p);
        }
        if (bigram.getLeftElement().equals("contain")) {
            list2.add(p);
        }
    }
    // Sort by descending relative frequency, breaking ties by the bigram itself.
    Collections.sort(list1, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {

        public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1, PairOfWritables<PairOfStrings, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter1 = Iterators.limit(list1.iterator(), 10);
    while (iter1.hasNext()) {
        PairOfWritables<PairOfStrings, FloatWritable> p = iter1.next();
        PairOfStrings bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
    }
    Collections.sort(list2, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {

        public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1, PairOfWritables<PairOfStrings, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter2 = Iterators.limit(list2.iterator(), 10);
    while (iter2.hasNext()) {
        PairOfWritables<PairOfStrings, FloatWritable> p = iter2.next();
        PairOfStrings bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FloatWritable(org.apache.hadoop.io.FloatWritable) PairOfWritables(tl.lin.data.pair.PairOfWritables) PairOfStrings(tl.lin.data.pair.PairOfStrings) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)
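
The two anonymous Comparator instances above are identical: they order by descending relative frequency and break ties by the bigram itself. As a sketch, assuming only the PairOfWritables, PairOfStrings, and FloatWritable accessors already used in this example, that comparator could be factored into one reusable constant:

import java.util.Comparator;

import org.apache.hadoop.io.FloatWritable;

import tl.lin.data.pair.PairOfStrings;
import tl.lin.data.pair.PairOfWritables;

public class BigramOrdering {
    // Descending by relative frequency; ties fall back to ascending bigram order.
    public static final Comparator<PairOfWritables<PairOfStrings, FloatWritable>> BY_FREQUENCY_DESC =
        new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
            @Override
            public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
                    PairOfWritables<PairOfStrings, FloatWritable> e2) {
                int cmp = e2.getRightElement().compareTo(e1.getRightElement());
                return cmp == 0 ? e1.getLeftElement().compareTo(e2.getLeftElement()) : cmp;
            }
        };
}

Both Collections.sort calls in the method above could then pass BigramOrdering.BY_FREQUENCY_DESC instead of repeating the anonymous class.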

Example 3 with PairOfStrings

Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.

From the class BigramRelativeFrequencyIT, method testBigramRelativeFrequencyBase.

@Test
public void testBigramRelativeFrequencyBase() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    assertTrue(fs.exists(collectionPath));
    String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.bigram.BigramRelativeFrequency.class.getCanonicalName(), "-input", collectionPath.toString(), "-output", tmpPrefix + "-base", "-numReducers", "1" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    // Read the single reducer's output back as (PairOfStrings, FloatWritable) records.
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(tmpPrefix + "-base/part-r-00000")));
    PairOfStrings pair = new PairOfStrings();
    FloatWritable f = new FloatWritable();
    reader.next(pair, f);
    assertEquals("&c", pair.getLeftElement());
    assertEquals("*", pair.getRightElement());
    assertEquals(17f, f.get(), 10e-6);
    for (int i = 0; i < 100; i++) {
        reader.next(pair, f);
    }
    assertEquals("'dear", pair.getLeftElement());
    assertEquals("*", pair.getRightElement());
    assertEquals(2f, f.get(), 10e-6);
    reader.next(pair, f);
    assertEquals("'dear", pair.getLeftElement());
    assertEquals("lord", pair.getRightElement());
    assertEquals(1f, f.get(), 10e-6);
    reader.close();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FloatWritable(org.apache.hadoop.io.FloatWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) PairOfStrings(tl.lin.data.pair.PairOfStrings) Test(org.junit.Test)
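
The test above spot-checks records at fixed positions. A small sketch, assuming the same (PairOfStrings key, FloatWritable value) layout written by BigramRelativeFrequency, shows how the same reader API can walk every record in a part file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.SequenceFile;

import tl.lin.data.pair.PairOfStrings;

public class DumpBigrams {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        SequenceFile.Reader reader =
            new SequenceFile.Reader(conf, SequenceFile.Reader.file(new Path(args[0])));
        try {
            PairOfStrings pair = new PairOfStrings();
            FloatWritable value = new FloatWritable();
            // next() returns false once the end of the file is reached.
            while (reader.next(pair, value)) {
                System.out.println(pair.getLeftElement() + " " + pair.getRightElement()
                    + "\t" + value.get());
            }
        } finally {
            reader.close();
        }
    }
}

Pointing it at the -base/part-r-00000 file above would print every bigram with its relative frequency rather than checking fixed positions.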

Example 4 with PairOfStrings

Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.

From the class ClueWebPageRankIT, method testPageRank.

@Test
public void testPageRank() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(collectionPath));
    String[] args;
    PairOfStrings pair;
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.BuildPageRankRecords.class.getCanonicalName(), "-input", collectionPath.toString(), "-output", tmpPrefix + "-clueweb09en01-PageRankRecords", "-numNodes", "50220423" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    // Hash partitioning, basic
    IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.basic");
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0000", "-numPartitions", "200", "-numNodes", "50220423" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.basic", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic-top10", "-top", "10" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.hash.basic-top10/part-r-00000");
    assertTrue(pair.getLeftElement().contains("16073008\t-6.381"));
    assertTrue(pair.getLeftElement().contains("42722712\t-6.425"));
    assertTrue(pair.getLeftElement().contains("16073696\t-6.552"));
    assertTrue(pair.getLeftElement().contains("16073003\t-6.604"));
    assertTrue(pair.getLeftElement().contains("47345600\t-6.610"));
    // Hash partitioning, Schimmy
    IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy");
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0000", "-numPartitions", "200", "-numNodes", "50220423" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy-top10", "-top", "10" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy-top10/part-r-00000");
    assertTrue(pair.getLeftElement().contains("16073008\t-6.371"));
    assertTrue(pair.getLeftElement().contains("42722712\t-6.421"));
    assertTrue(pair.getLeftElement().contains("16073696\t-6.540"));
    assertTrue(pair.getLeftElement().contains("16073003\t-6.592"));
    assertTrue(pair.getLeftElement().contains("47345600\t-6.597"));
    // Range partitioning, basic
    IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.basic");
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0000", "-numPartitions", "200", "-numNodes", "50220423", "-range" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.range.basic", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic-top10", "-top", "10" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.range.basic-top10/part-r-00000");
    assertTrue(pair.getLeftElement().contains("16073008\t-6.381"));
    assertTrue(pair.getLeftElement().contains("42722712\t-6.425"));
    assertTrue(pair.getLeftElement().contains("16073696\t-6.552"));
    assertTrue(pair.getLeftElement().contains("16073003\t-6.604"));
    assertTrue(pair.getLeftElement().contains("47345600\t-6.610"));
    // Range partitioning, Schimmy
    IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.schimmy");
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0000", "-numPartitions", "200", "-numNodes", "50220423", "-range" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy-top10", "-top", "10" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.range.schimmy-top10/part-r-00000");
    assertTrue(pair.getLeftElement().contains("16073008\t-6.372"));
    assertTrue(pair.getLeftElement().contains("42722712\t-6.420"));
    assertTrue(pair.getLeftElement().contains("16073696\t-6.541"));
    assertTrue(pair.getLeftElement().contains("16073003\t-6.593"));
    assertTrue(pair.getLeftElement().contains("47345600\t-6.599"));
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) PairOfStrings(tl.lin.data.pair.PairOfStrings) Test(org.junit.Test)
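
Each assertion here searches the stdout of hadoop fs -cat (the pair's left element) for a "nodeId<TAB>score" substring. A hypothetical parsing sketch, assuming the top-10 output is one tab-separated record per line, shows how that stdout could be turned into typed values instead of substring checks:

import tl.lin.data.pair.PairOfStrings;

public class TopNodesParser {
    public static void printTopNodes(PairOfStrings catOutput) {
        // Left element holds the captured stdout of "hadoop fs -cat ...".
        for (String line : catOutput.getLeftElement().split("\n")) {
            if (line.isEmpty()) {
                continue;
            }
            String[] fields = line.split("\t");
            int nodeId = Integer.parseInt(fields[0]);
            float logPageRank = Float.parseFloat(fields[1]);
            System.out.println(nodeId + " -> " + logPageRank);
        }
    }
}

It could be called as printTopNodes(pair) immediately after each hadoop fs -cat command.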

Example 5 with PairOfStrings

Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.

From the class SimplePageRankIT, method testPageRank.

@Test
public void testPageRank() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    IntegrationUtils.exec("hadoop fs -put docs/exercises/sample-large.txt");
    assertTrue(fs.exists(collectionPath));
    String[] args;
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.BuildPageRankRecords.class.getCanonicalName(), "-input", "sample-large.txt", "-output", tmpPrefix + "-sample-large-PageRankRecords", "-numNodes", "1458" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-sample-large-PageRank");
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-sample-large-PageRankRecords", "-output", tmpPrefix + "-sample-large-PageRank/iter0000", "-numPartitions", "5", "-numNodes", "1458" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(), "-base", tmpPrefix + "-sample-large-PageRank", "-numNodes", "1458", "-start", "0", "-end", "10", "-useCombiner" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-sample-large-PageRank/iter0010", "-output", tmpPrefix + "-sample-large-PageRank-top10", "-top", "10" };
    IntegrationUtils.exec(Joiner.on(" ").join(args));
    PairOfStrings pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-sample-large-PageRank-top10/part-r-00000");
    assertTrue(pair.getLeftElement().contains("9369084\t-4.38753"));
    assertTrue(pair.getLeftElement().contains("8669492\t-4.45486"));
    assertTrue(pair.getLeftElement().contains("12486146\t-4.77488"));
    assertTrue(pair.getLeftElement().contains("9265639\t-4.855565"));
    assertTrue(pair.getLeftElement().contains("10912914\t-4.86802"));
    IntegrationUtils.exec("hadoop fs -rm sample-large.txt");
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) PairOfStrings(tl.lin.data.pair.PairOfStrings) Test(org.junit.Test)
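
This test copies sample-large.txt into HDFS at the start and removes it at the end, but a failing assertion would skip the final hadoop fs -rm. A sketch of the same test body, assuming nothing beyond the IntegrationUtils.exec call already used above, that guarantees the cleanup runs:

@Test
public void testPageRankWithCleanup() throws Exception {
    // Stage the sample graph, then make sure it is removed even if an assertion fails.
    IntegrationUtils.exec("hadoop fs -put docs/exercises/sample-large.txt");
    try {
        // ... the pipeline steps and assertions from testPageRank above ...
    } finally {
        IntegrationUtils.exec("hadoop fs -rm sample-large.txt");
    }
}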

Aggregations

PairOfStrings (tl.lin.data.pair.PairOfStrings): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 4
Test (org.junit.Test): 4
Path (org.apache.hadoop.fs.Path): 2
FloatWritable (org.apache.hadoop.io.FloatWritable): 2
CommandLine (org.apache.commons.cli.CommandLine): 1
CommandLineParser (org.apache.commons.cli.CommandLineParser): 1
GnuParser (org.apache.commons.cli.GnuParser): 1
HelpFormatter (org.apache.commons.cli.HelpFormatter): 1
Options (org.apache.commons.cli.Options): 1
ParseException (org.apache.commons.cli.ParseException): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
PairOfWritables (tl.lin.data.pair.PairOfWritables): 1
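
All five examples treat PairOfStrings as a plain Writable pair with a left and a right element. A minimal standalone sketch, assuming the two-argument constructor, set, and left-then-right compareTo ordering of the lin-data pair classes (the examples above only show the no-argument constructor and the getters):

import tl.lin.data.pair.PairOfStrings;

public class PairOfStringsDemo {
    public static void main(String[] args) {
        // Assumed two-argument constructor: left element, then right element.
        PairOfStrings bigram = new PairOfStrings("light", "house");
        System.out.println(bigram.getLeftElement());   // light
        System.out.println(bigram.getRightElement());  // house

        PairOfStrings other = new PairOfStrings();
        other.set("light", "switch");
        // Assumed ordering: compare left elements first, then right elements.
        System.out.println(bigram.compareTo(other) < 0);  // true ("house" < "switch")
    }
}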