Use of org.apache.hadoop.io.Writable in project Cloud9 by lintool.
The class RunPageRankSchimmy, method phase1.
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws Exception {
    Configuration conf = getConf();
    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";
    FileSystem fs = FileSystem.get(conf);
    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }
    conf.setInt("NodeCount", n);
    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }
    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));
    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }
        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));
        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();
        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }
    LOG.info(sb.toString().trim());
    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);
    int numReduceTasks = numPartitions;
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);
    job.setNumReduceTasks(numReduceTasks);
    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);
    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }
    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }
    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }
    job.setReducerClass(ReduceClass.class);
    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    // Combine the mass values written by each reducer (stored as log probabilities).
    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }
    return mass;
}
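The helper sumLogProbs used in the final loop is not part of this snippet; the mass files hold log probabilities, so the accumulation has to be done in log space. A minimal sketch of such a helper, assuming the standard log-sum-exp formulation rather than Cloud9's exact implementation:

// Hypothetical stand-in for sumLogProbs: adds two probabilities that are
// stored as log values, without underflow. Not copied from Cloud9.
private static float sumLogProbs(float a, float b) {
    if (a == Float.NEGATIVE_INFINITY) {
        return b;
    }
    if (b == Float.NEGATIVE_INFINITY) {
        return a;
    }
    // log(e^a + e^b) = max(a, b) + log(1 + e^(min - max))
    float max = Math.max(a, b);
    float min = Math.min(a, b);
    return (float) (max + Math.log1p(Math.exp(min - max)));
}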
Use of org.apache.hadoop.io.Writable in project Cloud9 by lintool.
The class FileMerger, method mergeSequenceFiles.
private static Path mergeSequenceFiles(Configuration configuration, String inputFiles, String outputFile, Class<? extends Writable> keyClass, Class<? extends Writable> valueClass, boolean deleteSource, boolean deleteDestinationFileIfExist) throws IOException, InstantiationException, IllegalAccessException {
    JobConf conf = new JobConf(configuration, FileMerger.class);
    FileSystem fs = FileSystem.get(conf);
    Path inputPath = new Path(inputFiles);
    Path outputPath = new Path(outputFile);
    if (deleteDestinationFileIfExist) {
        if (fs.exists(outputPath)) {
            // carefully remove the destination file, not recursive
            fs.delete(outputPath, false);
            sLogger.info("Warning: remove destination file since it already exists...");
        }
    } else {
        Preconditions.checkArgument(!fs.exists(outputPath), new IOException("Destination file already exists..."));
    }
    FileStatus[] fileStatuses = fs.globStatus(inputPath);
    SequenceFile.Reader sequenceFileReader = null;
    SequenceFile.Writer sequenceFileWriter = null;
    Writable key, value;
    key = keyClass.newInstance();
    value = valueClass.newInstance();
    try {
        sequenceFileWriter = new SequenceFile.Writer(fs, conf, outputPath, keyClass, valueClass);
        for (FileStatus fileStatus : fileStatuses) {
            sLogger.info("Opening file " + fileStatus.getPath() + "...");
            sequenceFileReader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);
            while (sequenceFileReader.next(key, value)) {
                sequenceFileWriter.append(key, value);
            }
            // Close each reader once it has been drained; otherwise only the
            // last one would be closed in the finally block.
            IOUtils.closeStream(sequenceFileReader);
            if (deleteSource) {
                fs.deleteOnExit(fileStatus.getPath());
            }
        }
    } finally {
        IOUtils.closeStream(sequenceFileReader);
        IOUtils.closeStream(sequenceFileWriter);
    }
    sLogger.info("Successfully merged " + inputPath.toString() + " to " + outputFile);
    return outputPath;
}
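The new SequenceFile.Writer(fs, conf, ...) and new SequenceFile.Reader(fs, path, conf) constructors used above are deprecated in current Hadoop releases. A sketch of the same merge loop against the option-based factory API, with placeholder paths and key/value classes (these are assumptions for illustration, not values from FileMerger):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Sketch only: merge two hypothetical inputs into one output using the
// non-deprecated SequenceFile.createWriter / Reader option APIs.
public class MergeSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path out = new Path("/tmp/merged.seq"); // placeholder output path
        IntWritable key = new IntWritable();
        Text value = new Text();
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(out),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class))) {
            for (String in : new String[] { "/tmp/part-00000", "/tmp/part-00001" }) { // placeholder inputs
                try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                        SequenceFile.Reader.file(new Path(in)))) {
                    while (reader.next(key, value)) {
                        writer.append(key, value);
                    }
                }
            }
        }
    }
}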
Use of org.apache.hadoop.io.Writable in project Cloud9 by lintool.
The class ReadSequenceFile, method readSequenceFile.
private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
    System.out.println("Reading " + path + "...\n");
    try {
        System.out.println("Key type: " + reader.getKeyClass().toString());
        System.out.println("Value type: " + reader.getValueClass().toString() + "\n");
    } catch (Exception e) {
        throw new RuntimeException("Error: loading key/value class");
    }
    Writable key, value;
    int n = 0;
    try {
        if (Tuple.class.isAssignableFrom(reader.getKeyClass())) {
            key = TUPLE_FACTORY.newTuple();
        } else {
            key = (Writable) reader.getKeyClass().newInstance();
        }
        if (Tuple.class.isAssignableFrom(reader.getValueClass())) {
            value = TUPLE_FACTORY.newTuple();
        } else {
            value = (Writable) reader.getValueClass().newInstance();
        }
        while (reader.next(key, value)) {
            System.out.println("Record " + n);
            System.out.println("Key: " + key + "\nValue: " + value);
            System.out.println("----------------------------------------");
            n++;
            if (n >= max) {
                break;
            }
        }
        reader.close();
        System.out.println(n + " records read.\n");
    } catch (Exception e) {
        e.printStackTrace();
    }
    return n;
}
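Instantiating keys and values with newInstance() only works for Writables with public no-arg constructors and skips configuration injection. A hedged alternative (this helper is illustrative, not part of Cloud9) is Hadoop's ReflectionUtils, which also hands a Configuration to Configurable types:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

// Illustrative helper: create key/value instances for an arbitrary SequenceFile,
// letting ReflectionUtils pass the Configuration to Configurable Writables.
public class WritableFactorySketch {
    public static Writable[] createKeyValue(SequenceFile.Reader reader, Configuration conf) {
        Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        return new Writable[] { key, value };
    }
}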
Use of org.apache.hadoop.io.Writable in project SQLWindowing by hbutani.
The class IOUtils, method createFileWindowingInput.
@SuppressWarnings("unchecked")
public static WindowingInput createFileWindowingInput(String path, String inputFormatClassName, String serDeClassName, Properties serDeProperties, Configuration conf) throws WindowingException {
    try {
        HiveConf hConf = new HiveConf(conf, IOUtils.class);
        JobConf job = new JobConf(hConf);
        Path p = new Path(path);
        p = makeQualified(p, conf);
        Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(inputFormatClassName);
        hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
        hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
        InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat) {
            ((TextInputFormat) iFmt).configure(job);
        }
        FileInputFormat.addInputPath(job, p);
        InputSplit[] iSplits = iFmt.getSplits(job, 1);
        org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr = (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
        hConf.set(INPUT_PATH, path);
        hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
        hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
        hConf.set(INPUT_SERDE_CLASS, serDeClassName);
        TableWindowingInput tIn = new TableWindowingInput();
        tIn.initialize(null, hConf, serDeProperties);
        return tIn;
    } catch (Exception e) {
        throw new WindowingException(e);
    }
}
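A hypothetical call of this helper, with a made-up path, standard Hadoop/Hive class names, and column metadata chosen purely for illustration (imports of WindowingInput and IOUtils from the SQLWindowing project are omitted since their packages are not shown above):

// Sketch of a caller; the path, columns, and SerDe choice are assumptions.
Configuration conf = new Configuration();
Properties serDeProps = new Properties();
serDeProps.setProperty("columns", "id,name");          // column names the SerDe should expose
serDeProps.setProperty("columns.types", "int,string"); // matching column types
WindowingInput in = IOUtils.createFileWindowingInput(
        "/user/hive/warehouse/t",                             // placeholder HDFS path
        "org.apache.hadoop.mapred.SequenceFileInputFormat",   // any mapred InputFormat
        "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", // placeholder SerDe
        serDeProps,
        conf);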
Use of org.apache.hadoop.io.Writable in project SQLWindowing by hbutani.
The class HiveUtils, method addTableasJobInput.
@SuppressWarnings("unchecked")
public static List<FieldSchema> addTableasJobInput(String db, String table, JobConf job, FileSystem fs) throws WindowingException {
    LOG.info("HiveUtils::addTableasJobInput invoked");
    try {
        HiveMetaStoreClient client = getClient(job);
        // 1. get Table details from Hive metastore
        db = validateDB(client, db);
        Table t = getTable(client, db, table);
        StorageDescriptor sd = t.getSd();
        // 2. add table's location to job input
        FileInputFormat.addInputPath(job, new Path(sd.getLocation()));
        // 3. set job inputFormatClass, extract from StorageDescriptor
        Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(sd.getInputFormat());
        job.setInputFormat(inputFormatClass);
        return client.getFields(db, table);
    } catch (WindowingException w) {
        throw w;
    } catch (Exception e) {
        throw new WindowingException(e);
    }
}
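A hedged example of driving this helper (the database and table names are placeholders, and the metastore connection settings are assumed to already be present in the JobConf):

// Hypothetical driver; "default" and "page_views" are made-up names.
JobConf job = new JobConf(HiveUtils.class);
FileSystem fs = FileSystem.get(job);
List<FieldSchema> columns = HiveUtils.addTableasJobInput("default", "page_views", job, fs);
for (FieldSchema col : columns) {
    System.out.println(col.getName() + "\t" + col.getType());
}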