Example 31 with WritableComparable

use of org.apache.hadoop.io.WritableComparable in project Cloud9 by lintool.

the class CreateMetadata method GenerateMetadata.

public static void GenerateMetadata(Path bitextPath, Path resultPath) throws IOException {
    System.out.println(bitextPath.toString());
    JobConf conf = new JobConf(CreateMetadata.class);
    FileSystem fileSys = FileSystem.get(conf);
    // Read every part file of the bitext directory passed in (the original code had the
    // debug path "/shared/bitexts/ar-en.ldc.10k/ar-en.10k.bitext" hard-coded here).
    SequenceFile.Reader[] x = SequenceFileOutputFormat.getReaders(conf, bitextPath);
    WritableComparable key = new IntWritable();
    PhrasePair value = new PhrasePair();
    // sc counts the bitext pairs read; ec and fc track the largest word id seen on the e- and f-side.
    int sc = 0;
    int ec = 0;
    int fc = 0;
    try {
        for (SequenceFile.Reader r : x) {
            while (r.next(key, value)) {
                sc = sc + 1;
                for (int word : value.getE().getWords()) {
                    if (word > ec) {
                        ec = word;
                    }
                }
                for (int word : value.getF().getWords()) {
                    if (word > fc) {
                        fc = word;
                    }
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("IO exception: " + e.getMessage());
    }
    Metadata theMetadata = new Metadata(sc, ec, fc);
    ObjectOutputStream mdstream = new ObjectOutputStream(new BufferedOutputStream(fileSys.create(resultPath)));
    mdstream.writeObject(theMetadata);
    mdstream.close();
}
Also used : Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) ObjectOutputStream(java.io.ObjectOutputStream) SequenceFile(org.apache.hadoop.io.SequenceFile) WritableComparable(org.apache.hadoop.io.WritableComparable) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) BufferedOutputStream(java.io.BufferedOutputStream) IntWritable(org.apache.hadoop.io.IntWritable)
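For completeness, the serialized Metadata can be read back from HDFS by mirroring the write path above. This is a minimal sketch, not part of the Cloud9 source: it assumes Metadata implements java.io.Serializable (which the writeObject call above already requires) and uses ObjectInputStream and BufferedInputStream, the counterparts of the output-stream classes listed above.

public static Metadata readMetadata(Path resultPath) throws IOException, ClassNotFoundException {
    JobConf conf = new JobConf(CreateMetadata.class);
    FileSystem fileSys = FileSystem.get(conf);
    // One Metadata instance was written, so a single readObject() call recovers it.
    ObjectInputStream mdstream = new ObjectInputStream(new BufferedInputStream(fileSys.open(resultPath)));
    try {
        return (Metadata) mdstream.readObject();
    } finally {
        mdstream.close();
    }
}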

Example 32 with WritableComparable

use of org.apache.hadoop.io.WritableComparable in project Cloud9 by lintool.

the class HubsAndAuthoritiesSchimmy method HACalc.

public int HACalc(String path, int iter, int jter, int nodeCount, boolean useCombiner, boolean useInmapCombiner, boolean useRange, int mapTasks, int reduceTasks) throws IOException {
    JobConf conf = new JobConf(HubsAndAuthoritiesSchimmy.class);
    String inputPath = path + "/iter" + sFormat.format(iter);
    String outputPath = path + "/iter" + sFormat.format(jter) + "t";
    FileSystem fs = FileSystem.get(conf);
    // int numPartitions = FileSystem.get(conf).listStatus(new
    // Path(inputPath)).length - 1;
    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(inputPath))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }
    conf.setInt("NodeCount", nodeCount);
    Partitioner p = null;
    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }
    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    HITSNode value = new HITSNode();
    FileStatus[] status = fs.listStatus(new Path(inputPath));
    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (f.getPath().getName().contains("_logs"))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        reader.next(key, value);
        @SuppressWarnings("unchecked") int np = p.getPartition(key, value, numPartitions);
        reader.close();
        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }
    sLogger.info(sb.toString().trim());
    sLogger.info("Tool: HubsAndAuthorities");
    sLogger.info(" - iteration: " + iter);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);
    conf.setJobName("Iter" + iter + "HubsAndAuthorities");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    if (useInmapCombiner) {
        conf.setMapperClass(HAMapperIMC.class);
    } else {
        conf.setMapperClass(HAMapper.class);
    }
    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }
    conf.setReducerClass(HAReducer.class);
    conf.setInt("jobIter", iter);
    conf.setInt("NodeCount", nodeCount);
    conf.set("PartitionMapping", sb.toString().trim());
    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Writable(org.apache.hadoop.io.Writable) ArrayListOfIntsWritable(tl.lin.data.array.ArrayListOfIntsWritable) IntWritable(org.apache.hadoop.io.IntWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) WritableComparable(org.apache.hadoop.io.WritableComparable) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) HashPartitioner(org.apache.hadoop.mapred.lib.HashPartitioner) Partitioner(org.apache.hadoop.mapred.Partitioner) IntWritable(org.apache.hadoop.io.IntWritable)
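The "peek inside each partition" trick above is useful on its own. Below is a minimal sketch of that pattern, not code from Cloud9: the input path is a placeholder, keys are assumed to be IntWritable as in the HITS graph above, and ReflectionUtils (org.apache.hadoop.util.ReflectionUtils) is used so the value class does not have to be known in advance.

JobConf conf = new JobConf();
FileSystem fs = FileSystem.get(conf);
// Placeholder path: a directory of SequenceFile part files produced by a previous job.
Path inputPath = new Path("/path/to/sequencefile-dir");
FileStatus[] parts = fs.listStatus(inputPath);
// Count the part files, skipping logs, just as HACalc does.
int numPartitions = 0;
for (FileStatus s : parts) {
    if (s.getPath().getName().contains("part-")) {
        numPartitions++;
    }
}
HashPartitioner<IntWritable, Writable> partitioner = new HashPartitioner<IntWritable, Writable>();
partitioner.configure(conf);
IntWritable key = new IntWritable();
for (FileStatus f : parts) {
    if (!f.getPath().getName().contains("part-")) {
        continue;
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
    // Instantiate a value of whatever class the file declares.
    Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    if (reader.next(key, value)) {
        // The part-XXXXX number on disk need not match key.hash % numPartitions, which is
        // exactly why HACalc records this mapping in the "PartitionMapping" job setting.
        System.out.println(f.getPath().getName() + " -> partition " + partitioner.getPartition(key, value, numPartitions));
    }
    reader.close();
}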

Example 33 with WritableComparable

use of org.apache.hadoop.io.WritableComparable in project druid by druid-io.

the class OrcStructConverter method convertField.

/**
 * Convert an ORC struct field, addressed by fieldIndex, as though the struct were a map. Complex types
 * are transformed into Java lists and maps where possible ({@link OrcStructConverter#convertList} and
 * {@link OrcStructConverter#convertMap}), and primitive types are extracted into an ingestion-friendly
 * state (e.g. 'int' and 'long'). Finally, if a field is not present, this method returns null.
 *
 * Note: "Union" types are not currently supported and will be returned as null.
 */
@Nullable
Object convertField(OrcStruct struct, int fieldIndex) {
    if (fieldIndex < 0) {
        return null;
    }
    TypeDescription schema = struct.getSchema();
    TypeDescription fieldDescription = schema.getChildren().get(fieldIndex);
    WritableComparable fieldValue = struct.getFieldValue(fieldIndex);
    if (fieldValue == null) {
        return null;
    }
    if (fieldDescription.getCategory().isPrimitive()) {
        return convertPrimitive(fieldDescription, fieldValue, binaryAsString);
    } else {
        /*
          ORC TYPE    WRITABLE TYPE
          array       org.apache.orc.mapred.OrcList
          map         org.apache.orc.mapred.OrcMap
          struct      org.apache.orc.mapred.OrcStruct
          uniontype   org.apache.orc.mapred.OrcUnion
       */
        switch(fieldDescription.getCategory()) {
            case LIST:
                OrcList orcList = (OrcList) fieldValue;
                return convertList(fieldDescription, orcList, binaryAsString);
            case MAP:
                OrcMap map = (OrcMap) fieldValue;
                return convertMap(fieldDescription, map, binaryAsString);
            case STRUCT:
                OrcStruct structMap = (OrcStruct) fieldValue;
                return convertStructToMap(structMap);
            case UNION:
            // sorry union types :(
            default:
                return null;
        }
    }
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) WritableComparable(org.apache.hadoop.io.WritableComparable) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcMap(org.apache.orc.mapred.OrcMap) Nullable(javax.annotation.Nullable)
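For context on the inputs this method handles, here is a minimal sketch using the public org.apache.orc.mapred API; it is an illustration, not Druid code, and the schema and field values are made up. It builds an OrcStruct (with org.apache.hadoop.io.Text and IntWritable field values) and reads a field back as the raw WritableComparable that convertField starts from before unwrapping it.

TypeDescription schema = TypeDescription.fromString("struct<name:string,age:int>");
OrcStruct struct = (OrcStruct) OrcStruct.createValue(schema);
struct.setFieldValue("name", new Text("druid"));
struct.setFieldValue("age", new IntWritable(10));
// Field index 1 is "age"; this is the raw value a call like convertField(struct, 1) would see.
WritableComparable raw = struct.getFieldValue(1);
System.out.println(raw);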

Example 34 with WritableComparable

use of org.apache.hadoop.io.WritableComparable in project druid by druid-io.

the class OrcStructConverter method convertMap.

private static Map<Object, Object> convertMap(TypeDescription fieldDescription, OrcMap<? extends WritableComparable, ? extends WritableComparable> map, boolean binaryAsString) {
    Map<Object, Object> converted = new HashMap<>();
    TypeDescription keyDescription = fieldDescription.getChildren().get(0);
    TypeDescription valueDescription = fieldDescription.getChildren().get(1);
    for (WritableComparable key : map.navigableKeySet()) {
        Object newKey = convertPrimitive(keyDescription, key, binaryAsString);
        if (valueDescription.getCategory().isPrimitive()) {
            converted.put(newKey, convertPrimitive(valueDescription, map.get(key), binaryAsString));
        } else {
            converted.put(newKey, map.get(key));
        }
    }
    return converted;
}
Also used : HashMap(java.util.HashMap) Object2IntOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap) WritableComparable(org.apache.hadoop.io.WritableComparable) TypeDescription(org.apache.orc.TypeDescription)
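Since convertMap is private to the converter, the sketch below (an illustration, not Druid code) only shows the kind of OrcMap it receives for a map<string,int> field. An OrcMap is a TreeMap of WritableComparable keys and values, which is why the method above can walk navigableKeySet() in sorted key order.

TypeDescription mapType = TypeDescription.fromString("map<string,int>");
OrcMap<Text, IntWritable> map = new OrcMap<>(mapType);
map.put(new Text("b"), new IntWritable(2));
map.put(new Text("a"), new IntWritable(1));
// navigableKeySet(), as used in convertMap above, yields the keys in sorted order: a, then b.
for (Text key : map.navigableKeySet()) {
    System.out.println(key + " -> " + map.get(key));
}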

Aggregations

WritableComparable (org.apache.hadoop.io.WritableComparable): 34
IOException (java.io.IOException): 14
Writable (org.apache.hadoop.io.Writable): 14
Path (org.apache.hadoop.fs.Path): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 11
JobConf (org.apache.hadoop.mapred.JobConf): 6
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 5
ArrayList (java.util.ArrayList): 4
IntWritable (org.apache.hadoop.io.IntWritable): 4
NullWritable (org.apache.hadoop.io.NullWritable): 4
SequenceFile (org.apache.hadoop.io.SequenceFile): 4
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4
PCollection (com.tdunning.plume.PCollection): 3
OutputChannel (com.tdunning.plume.local.lazy.MSCR.OutputChannel): 3
PlumeObject (com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject): 3
HashMap (java.util.HashMap): 3
BytesWritable (org.apache.hadoop.io.BytesWritable): 3
FloatWritable (org.apache.hadoop.io.FloatWritable): 3
HCatRecord (org.apache.hive.hcatalog.data.HCatRecord): 3
DoFn (com.tdunning.plume.DoFn): 2