Use of com.tdunning.plume.types.PTableType in project Plume by tdunning.
From the class MapRedExecutor, method getMapRed.
/**
 * Builds a Hadoop Job instance out of a {@link MSCR} entity. It stores the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path used for the workflow's intermediate results
 * @param outputPath The output path of the MapReduce job
 * @return A Hadoop-executable MapReduce Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath) throws IOException {
  Configuration conf = new Configuration();
  conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
  conf.setInt(MSCR_ID, mscr.getId());
  conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
  Job job = new Job(conf, "MSCR"); // TODO deprecation
  job.setMapOutputKeyClass(PlumeObject.class);
  job.setMapOutputValueClass(PlumeObject.class);
  job.setJarByClass(MapRedExecutor.class);
  // Define multiple inputs
  for (PCollection<?> input : mscr.getInputs()) {
    if (!(input instanceof LazyCollection)) {
      throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyCollection");
    }
    LazyCollection<Text> l = (LazyCollection<Text>) input;
    if (!(l.isMaterialized() && l.getFile() != null)) {
      // Collections have a plume ID only if they are intermediate results - TODO better naming for this
      if (l.getPlumeId().length() < 1) {
        throw new IllegalArgumentException("Can't create MapRed from MSCR inputs that are not materialized to a file");
      }
    }
    // Pick the input format from the collection's type: String/String tables and
    // String collections are read as text, everything else as sequence files.
    PCollectionType<?> rType = l.getType();
    Class<? extends InputFormat> format = SequenceFileInputFormat.class;
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
        format = KeyValueTextInputFormat.class;
      }
    } else if (rType.elementType() instanceof StringType) {
      format = TextInputFormat.class;
    }
    MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
  }
  // Define multiple outputs
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
    PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output).getType();
    Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
          getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
    } else {
      if (rType.elementType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
          NullWritable.class, getHadoopType(rType.elementType()));
    }
  }
  // Define Reducer & Combiner
  job.setCombinerClass(MSCRCombiner.class);
  job.setReducerClass(MSCRReducer.class);
  job.setNumReduceTasks(1);
  return job;
}
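The core of how PTableType is used here is type-driven dispatch: the collection's PCollectionType decides which Hadoop formats the job reads and writes. A minimal standalone sketch of the input-side decision follows, using only the accessors the snippet above already relies on (keyType(), valueType(), elementType()); the helper name chooseInputFormat is ours for illustration, not part of Plume.

// Illustrative helper (not Plume API): mirrors the input-format dispatch above.
static Class<? extends InputFormat> chooseInputFormat(PCollectionType<?> rType) {
  if (rType instanceof PTableType) {
    PTableType<?, ?> tType = (PTableType<?, ?>) rType;
    // String-to-String tables can be read back as key/value text...
    return (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType)
        ? KeyValueTextInputFormat.class
        : SequenceFileInputFormat.class;
  }
  // ...plain String collections as text; anything else falls back to sequence files.
  return rType.elementType() instanceof StringType
      ? TextInputFormat.class
      : SequenceFileInputFormat.class;
}

The output side applies the same rule, with the channel number (entry.getValue()) doubling as the named-output name for MultipleOutputs.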
Use of com.tdunning.plume.types.PTableType in project Plume by tdunning.
From the class MSCRMapper, method map.
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value,
    final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context context)
    throws IOException, InterruptedException {
  LazyCollection<?> l = null;
  FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
  // Get the LazyCollection for this input (according to its FileSplit)
  for (PCollection<?> input : mscr.getInputs()) {
    LazyCollection<?> thisL = (LazyCollection<?>) input;
    if (thisL.getFile() == null) {
      // Convention for intermediate results
      thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
    }
    if (fS.getPath().toString().startsWith(thisL.getFile())
        || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
      l = thisL;
      break;
    }
  }
  if (l == null) {
    throw new RuntimeException("Unable to match input split with any MSCR input");
  }
  // If this collection is a table, process a Pair; otherwise process the value alone
  PCollectionType type = l.getType();
  Object toProcess = value;
  if (type instanceof PTableType) {
    toProcess = Pair.create(key, value);
  }
  for (DeferredOp op : l.getDownOps()) {
    if (op instanceof MultipleParallelDo) {
      MultipleParallelDo mPDo = (MultipleParallelDo) op;
      for (Object entry : mPDo.getDests().entrySet()) {
        Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
        LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
        DeferredOp childOp = null;
        if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
          childOp = lCol.getDownOps().get(0);
        }
        final Integer channel;
        if (childOp instanceof Flatten) {
          channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
        } else if (childOp instanceof GroupByKey) {
          channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
        } else {
          // bypass channel?
          channel = mscr.getNumberedChannels().get(en.getKey());
        }
        if (channel == null) {
          // This record is not for this MSCR - just skip it
          return;
        }
        // Call the parallelDo function
        en.getValue().process(toProcess, new EmitFn() {
          @Override
          public void emit(Object v) {
            try {
              if (v instanceof Pair) {
                Pair p = (Pair) v;
                context.write(new PlumeObject((WritableComparable) p.getKey(), channel),
                    new PlumeObject((WritableComparable) p.getValue(), channel));
              } else {
                context.write(new PlumeObject((WritableComparable) v, channel),
                    new PlumeObject((WritableComparable) v, channel));
              }
            } catch (Exception e) {
              e.printStackTrace(); // TODO how to report this
            }
          }
        });
      }
    } else {
      if (op instanceof Flatten) {
        l = (LazyCollection) ((Flatten) op).getDest();
      }
      int channel = mscr.getNumberedChannels().get(l);
      if (toProcess instanceof Pair) {
        context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
      } else {
        context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
      }
    }
  }
}
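The PTableType check near the top of map is what distinguishes table records from plain collection elements before any DoFn runs. As a rough sketch (the helper recordFor is hypothetical, not Plume API), that dispatch reduces to:

// Hedged sketch: wrap (key, value) as a Pair only when the collection's type
// says it is a table; plain collections hand just the value to the DoFn.
static Object recordFor(PCollectionType<?> type, WritableComparable key, WritableComparable value) {
  return (type instanceof PTableType) ? Pair.create(key, value) : value;
}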