use of org.apache.hadoop.io.WritableComparable in project Cloud9 by lintool.
the class CreateMetadata method GenerateMetadata.
/**
 * Scans a bitext stored as sequence files and writes summary metadata for it.
 *
 * <p>Every (key, {@code PhrasePair}) record under {@code bitextPath} is read. The method counts
 * the records and tracks the largest word value seen in {@code getE().getWords()} and in
 * {@code getF().getWords()} (both maxima start at 0). The three numbers are wrapped in a
 * {@code Metadata} object that is Java-serialized to {@code resultPath}.
 *
 * @param bitextPath location of the sequence-file output to scan
 * @param resultPath file the serialized {@code Metadata} is written to (created via the
 *                   default {@code FileSystem} for the job configuration)
 * @throws IOException if the readers cannot be opened or the metadata cannot be written
 * @throws RuntimeException if reading a record fails (the original {@code IOException} is kept
 *                          as the cause)
 */
public static void GenerateMetadata(Path bitextPath, Path resultPath) throws IOException {
  System.out.println(bitextPath.toString());
  JobConf conf = new JobConf(CreateMetadata.class);
  FileSystem fileSys = FileSystem.get(conf);
  // Read from the path the caller asked for; a hard-coded debugging path used to be wired in
  // here, which silently ignored the bitextPath argument.
  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, bitextPath);
  WritableComparable key = new IntWritable();
  PhrasePair value = new PhrasePair();
  int sc = 0; // number of records (sentence pairs) seen
  int ec = 0; // largest word value on the E side
  int fc = 0; // largest word value on the F side
  try {
    for (SequenceFile.Reader r : readers) {
      while (r.next(key, value)) {
        sc = sc + 1;
        for (int word : value.getE().getWords()) {
          if (word > ec) {
            ec = word;
          }
        }
        for (int word : value.getF().getWords()) {
          if (word > fc) {
            fc = word;
          }
        }
      }
    }
  } catch (IOException e) {
    // Keep the historical exception type, but preserve the cause for diagnosis.
    throw new RuntimeException("IO exception: " + e.getMessage(), e);
  } finally {
    // Release every reader, even when scanning failed part-way through.
    for (SequenceFile.Reader r : readers) {
      try {
        r.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: a failure to close must not mask the primary outcome.
      }
    }
  }
  Metadata theMetadata = new Metadata(sc, ec, fc);
  ObjectOutputStream mdstream = new ObjectOutputStream(new BufferedOutputStream(fileSys.create(resultPath)));
  try {
    mdstream.writeObject(theMetadata);
  } finally {
    mdstream.close();
  }
}
use of org.apache.hadoop.io.WritableComparable in project Cloud9 by lintool.
the class HubsAndAuthoritiesSchimmy method HACalc.
/**
 * Configures and runs one Hubs-and-Authorities iteration as a MapReduce job.
 *
 * <p>Input is read from {@code path + "/iter" + sFormat.format(iter)} and output is written to
 * {@code path + "/iter" + sFormat.format(jter) + "t"} (the output directory is deleted first if
 * it exists). Before the job is submitted, each partition file of the input is opened and its
 * first record is run through the partitioner, so that a "partition number = file path" mapping
 * can be passed to the job in the {@code PartitionMapping} configuration property.
 *
 * @param path             base directory that holds the per-iteration directories
 * @param iter             iteration whose output is used as input
 * @param jter             iteration number used to name the output directory
 * @param nodeCount        value stored in the {@code NodeCount} configuration property
 * @param useCombiner      currently not consulted by this method
 * @param useInmapCombiner selects {@code HAMapperIMC} instead of {@code HAMapper}
 * @param useRange         selects {@code RangePartitioner}; otherwise hash partitioning is
 *                         used to compute the mapping and no partitioner class is set on the job
 * @param mapTasks         number of map tasks to request
 * @param reduceTasks      number of reduce tasks to request
 * @return always 0
 * @throws IOException if the file system cannot be accessed or the job fails
 */
public int HACalc(String path, int iter, int jter, int nodeCount, boolean useCombiner, boolean useInmapCombiner, boolean useRange, int mapTasks, int reduceTasks) throws IOException {
  JobConf conf = new JobConf(HubsAndAuthoritiesSchimmy.class);
  String inputPath = path + "/iter" + sFormat.format(iter);
  String outputPath = path + "/iter" + sFormat.format(jter) + "t";
  FileSystem fs = FileSystem.get(conf);
  // List the input directory once and reuse the listing for counting and for peeking.
  FileStatus[] status = fs.listStatus(new Path(inputPath));
  // we need to actually count the number of part files to get the number
  // of partitions (because the directory might contain _log)
  int numPartitions = 0;
  for (FileStatus s : status) {
    if (s.getPath().getName().contains("part-")) {
      numPartitions++;
    }
  }
  // Must be set before the partitioner is configured from conf below.
  conf.setInt("NodeCount", nodeCount);
  Partitioner p = null;
  if (useRange) {
    p = new RangePartitioner<IntWritable, Writable>();
    p.configure(conf);
  } else {
    p = new HashPartitioner<WritableComparable, Writable>();
  }
  // this is really annoying: the mapping between the partition numbers on
  // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
  // key.hash % #reducer) is arbitrary... so this means that we need to
  // open up each partition, peek inside to find out.
  IntWritable key = new IntWritable();
  HITSNode value = new HITSNode();
  StringBuilder sb = new StringBuilder();
  for (FileStatus f : status) {
    // Only look at the files that were counted as partitions above; anything else in the
    // directory (_logs, job marker files, ...) is not a partition and is skipped.
    if (!f.getPath().getName().contains("part-")) {
      continue;
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
    try {
      if (!reader.next(key, value)) {
        // An empty file has no record to derive a partition number from; reusing the key
        // left over from the previous file would register a bogus mapping entry.
        sLogger.info(f.getPath() + "\tempty, skipped");
        continue;
      }
      @SuppressWarnings("unchecked") int np = p.getPartition(key, value, numPartitions);
      sLogger.info(f.getPath() + "\t" + np);
      sb.append(np + "=" + f.getPath() + "\t");
    } finally {
      // Always release the reader, including when peeking fails.
      reader.close();
    }
  }
  String partitionMapping = sb.toString().trim();
  sLogger.info(partitionMapping);
  sLogger.info("Tool: HubsAndAuthorities");
  sLogger.info(" - iteration: " + iter);
  sLogger.info(" - number of mappers: " + mapTasks);
  sLogger.info(" - number of reducers: " + reduceTasks);
  conf.setJobName("Iter" + iter + "HubsAndAuthorities");
  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(reduceTasks);
  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);
  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(HITSNode.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  if (useInmapCombiner) {
    conf.setMapperClass(HAMapperIMC.class);
  } else {
    conf.setMapperClass(HAMapper.class);
  }
  if (useRange) {
    conf.setPartitionerClass(RangePartitioner.class);
  }
  conf.setReducerClass(HAReducer.class);
  conf.setInt("jobIter", iter);
  conf.set("PartitionMapping", partitionMapping);
  // Delete the output directory if it exists already
  Path outputDir = new Path(outputPath);
  fs.delete(outputDir, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(conf);
  sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  return 0;
}
use of org.apache.hadoop.io.WritableComparable in project druid by druid-io.
the class OrcStructConverter method convertField.
/**
 * Converts the field at {@code fieldIndex} of an ORC struct, treating the struct like a map of
 * its fields.
 *
 * <ul>
 *   <li>Primitive fields are handed to {@code convertPrimitive}.</li>
 *   <li>Lists, maps and nested structs are handed to {@link OrcStructConverter#convertList},
 *       {@link OrcStructConverter#convertMap} and {@code convertStructToMap} respectively.</li>
 *   <li>A negative index (field not present) or a null field value yields {@code null}.</li>
 * </ul>
 *
 * Note: "Union" types are not currently supported and are returned as null.
 */
@Nullable
Object convertField(OrcStruct struct, int fieldIndex) {
  // A negative index means the field is absent.
  if (fieldIndex < 0) {
    return null;
  }

  final TypeDescription fieldType = struct.getSchema().getChildren().get(fieldIndex);
  final WritableComparable rawValue = struct.getFieldValue(fieldIndex);
  if (rawValue == null) {
    return null;
  }

  if (fieldType.getCategory().isPrimitive()) {
    return convertPrimitive(fieldType, rawValue, binaryAsString);
  }

  // Complex ORC types and the writable classes that carry them:
  //   array      -> org.apache.orc.mapred.OrcList
  //   map        -> org.apache.orc.mapred.OrcMap
  //   struct     -> org.apache.orc.mapred.OrcStruct
  //   uniontype  -> org.apache.orc.mapred.OrcUnion
  switch (fieldType.getCategory()) {
    case LIST:
      return convertList(fieldType, (OrcList) rawValue, binaryAsString);
    case MAP:
      return convertMap(fieldType, (OrcMap) rawValue, binaryAsString);
    case STRUCT:
      return convertStructToMap((OrcStruct) rawValue);
    case UNION:
      // union types are not handled; fall through to the null result
    default:
      return null;
  }
}
use of org.apache.hadoop.io.WritableComparable in project druid by druid-io.
the class OrcStructConverter method convertMap.
/**
 * Copies an ORC map into a plain {@link HashMap}.
 *
 * <p>Keys are always passed through {@code convertPrimitive} using the first child of
 * {@code fieldDescription}. Values are passed through {@code convertPrimitive} only when the
 * second child describes a primitive category; otherwise the value object is stored as-is.
 */
private static Map<Object, Object> convertMap(TypeDescription fieldDescription, OrcMap<? extends WritableComparable, ? extends WritableComparable> map, boolean binaryAsString) {
  final TypeDescription keyType = fieldDescription.getChildren().get(0);
  final TypeDescription valueType = fieldDescription.getChildren().get(1);
  final Map<Object, Object> result = new HashMap<>();

  // Entries are visited in ascending key order, the same order as the navigable key set.
  for (Map.Entry<? extends WritableComparable, ? extends WritableComparable> entry : map.entrySet()) {
    final Object convertedKey = convertPrimitive(keyType, entry.getKey(), binaryAsString);
    final Object convertedValue;
    if (valueType.getCategory().isPrimitive()) {
      convertedValue = convertPrimitive(valueType, entry.getValue(), binaryAsString);
    } else {
      convertedValue = entry.getValue();
    }
    result.put(convertedKey, convertedValue);
  }
  return result;
}
Aggregations