Example 31 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project hadoop by apache.

The class TestTextInputFormat, method testFormat:

@Test(timeout = 500000)
public void testFormat() throws Exception {
    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "test.txt");
    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        LOG.debug("creating; entries = " + length);
        // create a file with length entries
        Writer writer = new OutputStreamWriter(localFs.create(file));
        try {
            for (int i = 0; i < length; i++) {
                writer.write(Integer.toString(i));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
            LOG.debug("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.getSplits(job, numSplits);
            LOG.debug("splitting: got =        " + splits.length);
            if (length == 0) {
                assertEquals("Files of length 0 are not returned from FileInputFormat.getSplits().", 1, splits.length);
                assertEquals("Empty file length == 0", 0, splits[0].getLength());
            }
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.length; j++) {
                LOG.debug("split[" + j + "]= " + splits[j]);
                RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);
                try {
                    int count = 0;
                    while (reader.next(key, value)) {
                        int v = Integer.parseInt(value.toString());
                        LOG.debug("read " + v);
                        if (bits.get(v)) {
                            LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());
                        }
                        assertFalse("Key in multiple partitions.", bits.get(v));
                        bits.set(v);
                        count++;
                    }
                    LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) Text(org.apache.hadoop.io.Text) Random(java.util.Random) OutputStreamWriter(java.io.OutputStreamWriter) LongWritable(org.apache.hadoop.io.LongWritable) Writer(java.io.Writer) Test(org.junit.Test)
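
The core pattern the test exercises, stripped of the split-coverage bookkeeping, is the classic old-API (org.apache.hadoop.mapred) read loop. A minimal, self-contained sketch; the input path and split count are illustrative, not taken from the test:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatReadLoop {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // illustrative input path
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        // TextInputFormat keys each record by the byte offset of the line,
        // hence LongWritable; the value is the line itself.
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (InputSplit split : format.getSplits(job, 4)) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}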

Example 32 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project hadoop by apache.

The class TestAutoInputFormat, method testFormat:

@SuppressWarnings({ "unchecked", "deprecation" })
@Test
public void testFormat() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "auto.txt");
    Path seqFile = new Path(dir, "auto.seq");
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    Writer txtWriter = new OutputStreamWriter(fs.create(txtFile));
    try {
        for (int i = 0; i < LINES_COUNT; i++) {
            txtWriter.write("" + (10 * i));
            txtWriter.write("\n");
        }
    } finally {
        txtWriter.close();
    }
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, seqFile, IntWritable.class, LongWritable.class);
    try {
        for (int i = 0; i < RECORDS_COUNT; i++) {
            IntWritable key = new IntWritable(11 * i);
            LongWritable value = new LongWritable(12 * i);
            seqWriter.append(key, value);
        }
    } finally {
        seqWriter.close();
    }
    AutoInputFormat format = new AutoInputFormat();
    InputSplit[] splits = format.getSplits(job, SPLITS_COUNT);
    for (InputSplit split : splits) {
        RecordReader reader = format.getRecordReader(split, job, Reporter.NULL);
        Object key = reader.createKey();
        Object value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                if (key instanceof LongWritable) {
                    assertEquals("Wrong value class.", Text.class, value.getClass());
                    assertTrue("Invalid value", Integer.parseInt(((Text) value).toString()) % 10 == 0);
                } else {
                    assertEquals("Wrong key class.", IntWritable.class, key.getClass());
                    assertEquals("Wrong value class.", LongWritable.class, value.getClass());
                    assertTrue("Invalid key.", ((IntWritable) key).get() % 11 == 0);
                    assertTrue("Invalid value.", ((LongWritable) value).get() % 12 == 0);
                }
            }
        } finally {
            reader.close();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) AutoInputFormat(org.apache.hadoop.streaming.AutoInputFormat) RecordReader(org.apache.hadoop.mapred.RecordReader) Text(org.apache.hadoop.io.Text) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) OutputStreamWriter(java.io.OutputStreamWriter) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Writer(java.io.Writer) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
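
AutoInputFormat sniffs each input file, so the same job mixes plain-text records (LongWritable offset / Text line) and SequenceFile records (IntWritable / LongWritable); that is what the instanceof branch above distinguishes. For reference, reading the SequenceFile back directly, without AutoInputFormat, looks roughly like this (the path is illustrative; the deprecated Reader constructor matches the deprecated createWriter call used above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

public class SequenceFileReadBack {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        // illustrative path; the test writes <test.build.data>/mapred/auto.seq
        Path seqFile = new Path("/tmp/mapred/auto.seq");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqFile, conf);
        try {
            IntWritable key = new IntWritable();
            LongWritable value = new LongWritable();
            // next(...) fills the reusable key/value instances in place
            while (reader.next(key, value)) {
                System.out.println(key.get() + " -> " + value.get());
            }
        } finally {
            reader.close();
        }
    }
}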

Example 33 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project flink by apache.

The class HadoopIOFormatsITCase, method preSubmit:

@Override
protected void preSubmit() throws Exception {
    resultPath = new String[] { getTempDirPath("result0"), getTempDirPath("result1") };
    File sequenceFile = createAndRegisterTempFile("seqFile");
    sequenceFileInPath = sequenceFile.toURI().toString();
    // Create a sequence file
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    FileSystem fs = FileSystem.get(URI.create(sequenceFile.getAbsolutePath()), conf);
    Path path = new Path(sequenceFile.getAbsolutePath());
    //  ------------------ Long / Text Key Value pair: ------------
    int kvCount = 4;
    LongWritable key = new LongWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
        for (int i = 0; i < kvCount; i++) {
            if (i == 1) {
                // write key = 1 a bit more often.
                for (int a = 0; a < 15; a++) {
                    key.set(i);
                    value.set(i + " - somestring");
                    writer.append(key, value);
                }
            }
            key.set(i);
            value.set(i + " - somestring");
            writer.append(key, value);
        }
    } finally {
        IOUtils.closeStream(writer);
    }
    //  ------------------ NullWritable (key) / LongWritable (value) pair: ------------
    File sequenceFileNull = createAndRegisterTempFile("seqFileNullKey");
    sequenceFileInPathNull = sequenceFileNull.toURI().toString();
    path = new Path(sequenceFileInPathNull);
    LongWritable value1 = new LongWritable();
    SequenceFile.Writer writer1 = null;
    try {
        writer1 = SequenceFile.createWriter(fs, conf, path, NullWritable.class, value1.getClass());
        for (int i = 0; i < kvCount; i++) {
            value1.set(i);
            writer1.append(NullWritable.get(), value1);
        }
    } finally {
        IOUtils.closeStream(writer1);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.flink.configuration.Configuration) Text(org.apache.hadoop.io.Text) NullWritable(org.apache.hadoop.io.NullWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) LongWritable(org.apache.hadoop.io.LongWritable) File(java.io.File)
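
preSubmit only writes the fixtures; the job under test reads them back through Flink's Hadoop compatibility layer. A minimal sketch of that read side, modeled on the readHadoopFile helper used in Examples 34 and 35 (pairing it with the old-API SequenceFileInputFormat here is an assumption, not code from this test):

// Assumes: import static org.apache.flink.hadoopcompatibility.HadoopInputs.readHadoopFile;
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Tuple2<LongWritable, Text>> pairs = env.createInput(
        readHadoopFile(new org.apache.hadoop.mapred.SequenceFileInputFormat<LongWritable, Text>(),
                LongWritable.class, Text.class, sequenceFileInPath));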

Example 34 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project flink by apache.

The class WordCountMapredITCase, method internalRun:

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        input = env.createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }
    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });
    // split up the lines in pairs (2-tuples) containing: (word,1)
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });
    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));
    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
Also used : Path(org.apache.hadoop.fs.Path) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Text(org.apache.hadoop.io.Text) HadoopOutputFormat(org.apache.flink.api.java.hadoop.mapred.HadoopOutputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Tuple2(org.apache.flink.api.java.tuple.Tuple2) LongWritable(org.apache.hadoop.io.LongWritable) Tokenizer(org.apache.flink.test.testfunctions.Tokenizer) JobConf(org.apache.hadoop.mapred.JobConf)
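
The map-side heavy lifting in this example (and in Example 35 below) is org.apache.flink.test.testfunctions.Tokenizer. A functionally equivalent sketch of such a tokenizer (the standard WordCount shape, not the actual test class):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        // normalize the line and split it into words
        for (String token : value.toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                // emit (word, 1) pairs for the downstream groupBy(0).sum(1)
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}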

Example 35 with LongWritable

Use of org.apache.hadoop.io.LongWritable in project flink by apache.

The class WordCountMapreduceITCase, method internalRun:

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        input = env.createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }
    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });
    // split up the lines in pairs (2-tuples) containing: (word,1)
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);
    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });
    // Set up Hadoop Output Format
    Job job = Job.getInstance();
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
    job.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(resultPath));
    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
Also used : Path(org.apache.hadoop.fs.Path) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) Text(org.apache.hadoop.io.Text) HadoopOutputFormat(org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat) TextInputFormat(org.apache.hadoop.mapreduce.lib.input.TextInputFormat) Tuple2(org.apache.flink.api.java.tuple.Tuple2) LongWritable(org.apache.hadoop.io.LongWritable) Job(org.apache.hadoop.mapreduce.Job) Tokenizer(org.apache.flink.test.testfunctions.Tokenizer)
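
The only material difference from Example 34 is the output side: a mapreduce Job instead of a JobConf, with the mapreduce-flavored HadoopOutputFormat and TextOutputFormat. The separator is still set through the legacy mapred.textoutputformat.separator key, which Hadoop's deprecated-key mapping translates; the modern spelling (verify against your Hadoop release) would be:

job.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");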

Aggregations

Usage counts of types co-occurring with LongWritable in the indexed examples:

LongWritable (org.apache.hadoop.io.LongWritable): 445
Text (org.apache.hadoop.io.Text): 220
Test (org.junit.Test): 171
IntWritable (org.apache.hadoop.io.IntWritable): 102
Path (org.apache.hadoop.fs.Path): 99
BytesWritable (org.apache.hadoop.io.BytesWritable): 70
FloatWritable (org.apache.hadoop.io.FloatWritable): 68
Configuration (org.apache.hadoop.conf.Configuration): 62
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 62
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 60
ArrayList (java.util.ArrayList): 59
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 57
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 53
IOException (java.io.IOException): 49
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 48
SequenceFile (org.apache.hadoop.io.SequenceFile): 42
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 40
FileSystem (org.apache.hadoop.fs.FileSystem): 37
JobConf (org.apache.hadoop.mapred.JobConf): 37
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 35
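
For reference, the LongWritable surface all of these examples lean on is small: a mutable boxed long implementing WritableComparable. A minimal round-trip sketch:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.LongWritable;

public class LongWritableRoundTrip {

    public static void main(String[] args) throws Exception {
        LongWritable w = new LongWritable(42L);
        // Writable serialization: write the raw long to a DataOutput...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        w.write(new DataOutputStream(bytes));
        // ...and read it back into a fresh, reusable instance
        LongWritable copy = new LongWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.get());        // 42
        System.out.println(w.compareTo(copy)); // 0: LongWritable is WritableComparable
    }
}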