Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.
The class BfsIT, method testBfs:
@Test
public void testBfs() throws Exception {
  Configuration conf = IntegrationUtils.getBespinConfiguration();
  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(collectionPath));

  String[] args;

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder.class.getCanonicalName(),
      "-input", collectionPath.toString(),
      "-output_file", tmpPrefix + "-enwiki-20121201-docno.dat",
      "-wiki_language", "en", "-keep_all" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.collection.wikipedia.RepackWikipedia.class.getCanonicalName(),
      "-input", collectionPath.toString(),
      "-output", tmpPrefix + "-enwiki-20121201.block",
      "-mapping_file", tmpPrefix + "-enwiki-20121201-docno.dat",
      "-wiki_language", "en", "-compression_type", "block" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-enwiki-20121201.block",
      "-edges_output", tmpPrefix + "-enwiki-20121201.edges",
      "-adjacency_list_output", tmpPrefix + "-enwiki-20121201.adj",
      "-num_partitions", "10" };
  PairOfStrings out = IntegrationUtils.exec(Joiner.on(" ").join(args));

  String errorOut = out.getRightElement();
  assertTrue(errorOut.contains("EDGES=121762273"));
  assertTrue(errorOut.contains("TOTAL_VERTICES=12961996"));
  assertTrue(errorOut.contains("VERTICES_WITH_OUTLINKS=10813673"));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.bfs.EncodeBfsGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-enwiki-20121201.adj",
      "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0000",
      "-src", "12" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  // First iteration of BFS.
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.bfs.IterateBfs.class.getCanonicalName(),
      "-input", tmpPrefix + "-enwiki-20121201.bfs/iter0000",
      "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0001",
      "-num_partitions", "10" };
  out = IntegrationUtils.exec(Joiner.on(" ").join(args));
  errorOut = out.getRightElement();
  assertTrue(errorOut.contains("ReachableInMapper=1"));
  assertTrue(errorOut.contains("ReachableInReducer=573"));

  // Second iteration of BFS.
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.bfs.IterateBfs.class.getCanonicalName(),
      "-input", tmpPrefix + "-enwiki-20121201.bfs/iter0001",
      "-output", tmpPrefix + "-enwiki-20121201.bfs/iter0002",
      "-num_partitions", "10" };
  out = IntegrationUtils.exec(Joiner.on(" ").join(args));
  errorOut = out.getRightElement();
  assertTrue(errorOut.contains("ReachableInMapper=573"));
  assertTrue(errorOut.contains("ReachableInReducer=37733"));
}
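The PairOfStrings returned by IntegrationUtils.exec above carries the captured stdout and stderr of the launched command, which is why the counter checks read getRightElement(). Below is a hypothetical, simplified stand-in for such a helper (not the actual IntegrationUtils implementation), assuming a Java 9+ runtime for InputStream.readAllBytes:

// Hypothetical sketch only: run a shell command and return (stdout, stderr) as a
// PairOfStrings, mirroring how the tests above read counters from the right element.
// A robust version would drain both streams concurrently to avoid blocking.
import java.nio.charset.StandardCharsets;
import tl.lin.data.pair.PairOfStrings;

public class ExecSketch {
  public static PairOfStrings run(String cmd) throws Exception {
    Process p = Runtime.getRuntime().exec(cmd);
    String stdout = new String(p.getInputStream().readAllBytes(), StandardCharsets.UTF_8);
    String stderr = new String(p.getErrorStream().readAllBytes(), StandardCharsets.UTF_8);
    p.waitFor();
    return new PairOfStrings(stdout, stderr);  // left = stdout, right = stderr
  }
}

A caller could then check run(cmd).getRightElement().contains("EDGES=...") in the same way the test asserts on Hadoop counters.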
Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.
The class AnalyzeBigramRelativeFrequency, method main:
@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("input path").create(INPUT));

  CommandLine cmdline = null;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    System.exit(-1);
  }

  if (!cmdline.hasOption(INPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(AnalyzeBigramRelativeFrequency.class.getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.exit(-1);
  }

  String inputPath = cmdline.getOptionValue(INPUT);
  System.out.println("input path: " + inputPath);

  List<PairOfWritables<PairOfStrings, FloatWritable>> pairs =
      SequenceFileUtils.readDirectory(new Path(inputPath));

  List<PairOfWritables<PairOfStrings, FloatWritable>> list1 = Lists.newArrayList();
  List<PairOfWritables<PairOfStrings, FloatWritable>> list2 = Lists.newArrayList();

  for (PairOfWritables<PairOfStrings, FloatWritable> p : pairs) {
    PairOfStrings bigram = p.getLeftElement();
    if (bigram.getLeftElement().equals("light")) {
      list1.add(p);
    }
    if (bigram.getLeftElement().equals("contain")) {
      list2.add(p);
    }
  }

  Collections.sort(list1, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
    public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
        PairOfWritables<PairOfStrings, FloatWritable> e2) {
      if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
        return e1.getLeftElement().compareTo(e2.getLeftElement());
      }
      return e2.getRightElement().compareTo(e1.getRightElement());
    }
  });

  Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter1 = Iterators.limit(list1.iterator(), 10);
  while (iter1.hasNext()) {
    PairOfWritables<PairOfStrings, FloatWritable> p = iter1.next();
    PairOfStrings bigram = p.getLeftElement();
    System.out.println(bigram + "\t" + p.getRightElement());
  }

  Collections.sort(list2, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
    public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
        PairOfWritables<PairOfStrings, FloatWritable> e2) {
      if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
        return e1.getLeftElement().compareTo(e2.getLeftElement());
      }
      return e2.getRightElement().compareTo(e1.getRightElement());
    }
  });

  Iterator<PairOfWritables<PairOfStrings, FloatWritable>> iter2 = Iterators.limit(list2.iterator(), 10);
  while (iter2.hasNext()) {
    PairOfWritables<PairOfStrings, FloatWritable> p = iter2.next();
    PairOfStrings bigram = p.getLeftElement();
    System.out.println(bigram + "\t" + p.getRightElement());
  }
}
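For reference, a minimal self-contained sketch of the same sort order used above (descending by relative frequency, ties broken by the bigram's natural left-then-right order), with made-up bigrams and frequencies rather than real job output, and assuming PairOfWritables exposes the two-argument constructor used here:

// Minimal sketch of the comparator logic from AnalyzeBigramRelativeFrequency.
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.hadoop.io.FloatWritable;
import com.google.common.collect.Lists;
import tl.lin.data.pair.PairOfStrings;
import tl.lin.data.pair.PairOfWritables;

public class SortSketch {
  public static void main(String[] args) {
    List<PairOfWritables<PairOfStrings, FloatWritable>> list = Lists.newArrayList();
    list.add(new PairOfWritables<>(new PairOfStrings("light", "of"), new FloatWritable(0.12f)));
    list.add(new PairOfWritables<>(new PairOfStrings("light", "and"), new FloatWritable(0.25f)));
    list.add(new PairOfWritables<>(new PairOfStrings("light", "in"), new FloatWritable(0.25f)));

    // Same ordering as the code above: higher frequency first, then bigram order.
    Collections.sort(list, new Comparator<PairOfWritables<PairOfStrings, FloatWritable>>() {
      public int compare(PairOfWritables<PairOfStrings, FloatWritable> e1,
          PairOfWritables<PairOfStrings, FloatWritable> e2) {
        if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
          return e1.getLeftElement().compareTo(e2.getLeftElement());
        }
        return e2.getRightElement().compareTo(e1.getRightElement());
      }
    });

    for (PairOfWritables<PairOfStrings, FloatWritable> p : list) {
      System.out.println(p.getLeftElement() + "\t" + p.getRightElement());
    }
    // Expected order: (light, and) 0.25, (light, in) 0.25, (light, of) 0.12
  }
}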
Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.
The class BigramRelativeFrequencyIT, method testBigramRelativeFrequencyBase:
@Test
public void testBigramRelativeFrequencyBase() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  assertTrue(fs.exists(collectionPath));

  String[] args = new String[] { "hadoop --config src/test/resources/hadoop-local-conf/ jar",
      IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.bigram.BigramRelativeFrequency.class.getCanonicalName(),
      "-input", collectionPath.toString(), "-output", tmpPrefix + "-base",
      "-numReducers", "1" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  SequenceFile.Reader reader = new SequenceFile.Reader(conf,
      SequenceFile.Reader.file(new Path(tmpPrefix + "-base/part-r-00000")));
  PairOfStrings pair = new PairOfStrings();
  FloatWritable f = new FloatWritable();

  reader.next(pair, f);
  assertEquals("&c", pair.getLeftElement());
  assertEquals("*", pair.getRightElement());
  assertEquals(17f, f.get(), 10e-6);

  for (int i = 0; i < 100; i++) {
    reader.next(pair, f);
  }
  assertEquals("'dear", pair.getLeftElement());
  assertEquals("*", pair.getRightElement());
  assertEquals(2f, f.get(), 10e-6);

  reader.next(pair, f);
  assertEquals("'dear", pair.getLeftElement());
  assertEquals("lord", pair.getRightElement());
  assertEquals(1f, f.get(), 10e-6);

  reader.close();
}
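The reader loop above relies on PairOfStrings being a Hadoop Writable that SequenceFile.Reader.next can deserialize in place. A small hypothetical round trip on a local path (the path and sample values are made up for illustration) shows the same pattern end to end:

// Hypothetical sketch: write two PairOfStrings/FloatWritable records to a local
// SequenceFile and read them back, mirroring the reader loop in the test above.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.SequenceFile;
import tl.lin.data.pair.PairOfStrings;

public class SequenceFileSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/bigram-sample.seq");  // hypothetical local path

    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(path),
        SequenceFile.Writer.keyClass(PairOfStrings.class),
        SequenceFile.Writer.valueClass(FloatWritable.class));
    writer.append(new PairOfStrings("&c", "*"), new FloatWritable(17f));
    writer.append(new PairOfStrings("'dear", "lord"), new FloatWritable(1f));
    writer.close();

    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
    PairOfStrings pair = new PairOfStrings();
    FloatWritable f = new FloatWritable();
    while (reader.next(pair, f)) {
      System.out.println(pair.getLeftElement() + " " + pair.getRightElement() + " -> " + f.get());
    }
    reader.close();
  }
}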
Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.
The class ClueWebPageRankIT, method testPageRank:
@Test
public void testPageRank() throws Exception {
  Configuration conf = IntegrationUtils.getBespinConfiguration();
  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(collectionPath));

  String[] args;
  PairOfStrings pair;

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.BuildPageRankRecords.class.getCanonicalName(),
      "-input", collectionPath.toString(),
      "-output", tmpPrefix + "-clueweb09en01-PageRankRecords",
      "-numNodes", "50220423" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  // Hash partitioning, basic
  IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.basic");
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRankRecords",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0000",
      "-numPartitions", "200", "-numNodes", "50220423" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(),
      "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.basic",
      "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0010",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic-top10",
      "-top", "10" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix
      + "-clueweb09en01-PageRank.hash.basic-top10/part-r-00000");
  assertTrue(pair.getLeftElement().contains("16073008\t-6.381"));
  assertTrue(pair.getLeftElement().contains("42722712\t-6.425"));
  assertTrue(pair.getLeftElement().contains("16073696\t-6.552"));
  assertTrue(pair.getLeftElement().contains("16073003\t-6.604"));
  assertTrue(pair.getLeftElement().contains("47345600\t-6.610"));

  // Hash partitioning, Schimmy
  IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy");
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRankRecords",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0000",
      "-numPartitions", "200", "-numNodes", "50220423" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(),
      "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy",
      "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0010",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy-top10",
      "-top", "10" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix
      + "-clueweb09en01-PageRank.hash.schimmy-top10/part-r-00000");
  assertTrue(pair.getLeftElement().contains("16073008\t-6.371"));
  assertTrue(pair.getLeftElement().contains("42722712\t-6.421"));
  assertTrue(pair.getLeftElement().contains("16073696\t-6.540"));
  assertTrue(pair.getLeftElement().contains("16073003\t-6.592"));
  assertTrue(pair.getLeftElement().contains("47345600\t-6.597"));

  // Range partitioning, basic
  IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.basic");
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRankRecords",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0000",
      "-numPartitions", "200", "-numNodes", "50220423", "-range" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(),
      "-base", tmpPrefix + "-clueweb09en01-PageRank.range.basic",
      "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0010",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic-top10",
      "-top", "10" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix
      + "-clueweb09en01-PageRank.range.basic-top10/part-r-00000");
  assertTrue(pair.getLeftElement().contains("16073008\t-6.381"));
  assertTrue(pair.getLeftElement().contains("42722712\t-6.425"));
  assertTrue(pair.getLeftElement().contains("16073696\t-6.552"));
  assertTrue(pair.getLeftElement().contains("16073003\t-6.604"));
  assertTrue(pair.getLeftElement().contains("47345600\t-6.610"));

  // Range partitioning, Schimmy
  IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.schimmy");
  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRankRecords",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0000",
      "-numPartitions", "200", "-numNodes", "50220423", "-range" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(),
      "-base", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy",
      "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(),
      "-input", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0010",
      "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy-top10",
      "-top", "10" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix
      + "-clueweb09en01-PageRank.range.schimmy-top10/part-r-00000");
  assertTrue(pair.getLeftElement().contains("16073008\t-6.372"));
  assertTrue(pair.getLeftElement().contains("42722712\t-6.420"));
  assertTrue(pair.getLeftElement().contains("16073696\t-6.541"));
  assertTrue(pair.getLeftElement().contains("16073003\t-6.593"));
  assertTrue(pair.getLeftElement().contains("47345600\t-6.599"));
}
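In each block above, the left element of the PairOfStrings returned by exec is the stdout of "hadoop fs -cat", i.e., the concatenated top-10 listing, and the assertions simply grep it for expected "nodeid<TAB>score" prefixes. A tiny hedged helper showing that pattern with fabricated sample output:

// Sketch of the assertion pattern used above; the sample values are illustrative only.
import tl.lin.data.pair.PairOfStrings;

public class TopNodesCheck {
  // Returns true if every expected substring appears in the captured stdout.
  static boolean containsAll(PairOfStrings out, String... expected) {
    String stdout = out.getLeftElement();  // stdout of "hadoop fs -cat ..."
    for (String e : expected) {
      if (!stdout.contains(e)) {
        return false;
      }
    }
    return true;
  }

  public static void main(String[] args) {
    PairOfStrings out = new PairOfStrings("16073008\t-6.381\n42722712\t-6.425\n", "");
    System.out.println(containsAll(out, "16073008\t-6.381", "42722712\t-6.425"));  // true
  }
}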
Use of tl.lin.data.pair.PairOfStrings in project Cloud9 by lintool.
The class SimplePageRankIT, method testPageRank:
@Test
public void testPageRank() throws Exception {
  Configuration conf = IntegrationUtils.getBespinConfiguration();
  FileSystem fs = FileSystem.get(conf);
  IntegrationUtils.exec("hadoop fs -put docs/exercises/sample-large.txt");
  assertTrue(fs.exists(collectionPath));

  String[] args;

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.BuildPageRankRecords.class.getCanonicalName(),
      "-input", "sample-large.txt",
      "-output", tmpPrefix + "-sample-large-PageRankRecords",
      "-numNodes", "1458" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-sample-large-PageRank");

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(),
      "-input", tmpPrefix + "-sample-large-PageRankRecords",
      "-output", tmpPrefix + "-sample-large-PageRank/iter0000",
      "-numPartitions", "5", "-numNodes", "1458" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(),
      "-base", tmpPrefix + "-sample-large-PageRank",
      "-numNodes", "1458", "-start", "0", "-end", "10", "-useCombiner" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
      edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(),
      "-input", tmpPrefix + "-sample-large-PageRank/iter0010",
      "-output", tmpPrefix + "-sample-large-PageRank-top10",
      "-top", "10" };
  IntegrationUtils.exec(Joiner.on(" ").join(args));

  PairOfStrings pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix
      + "-sample-large-PageRank-top10/part-r-00000");
  assertTrue(pair.getLeftElement().contains("9369084\t-4.38753"));
  assertTrue(pair.getLeftElement().contains("8669492\t-4.45486"));
  assertTrue(pair.getLeftElement().contains("12486146\t-4.77488"));
  assertTrue(pair.getLeftElement().contains("9265639\t-4.855565"));
  assertTrue(pair.getLeftElement().contains("10912914\t-4.86802"));

  IntegrationUtils.exec("hadoop fs -rm sample-large.txt");
}
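A short illustrative sketch of parsing the -cat output into (node, score) pairs follows; treating the scores as natural-log PageRank mass is an assumption based on their negative values, and the sample lines are copied from the assertions above rather than taken from a real run:

// Illustrative only: split each output line into a (node, score) PairOfStrings and,
// under the log-mass assumption stated above, exponentiate to recover a probability.
import tl.lin.data.pair.PairOfStrings;

public class TopNodesParse {
  public static void main(String[] args) {
    String stdout = "9369084\t-4.38753\n8669492\t-4.45486\n";  // sample of the -cat output
    for (String line : stdout.split("\n")) {
      String[] cols = line.split("\t");
      PairOfStrings p = new PairOfStrings(cols[0], cols[1]);
      double mass = Math.exp(Double.parseDouble(p.getRightElement()));
      System.out.println(p.getLeftElement() + " -> " + mass);
    }
  }
}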