Example 46 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.

From the class BSONFileInputFormat, method getRecordReader:

@Override
public RecordReader<NullWritable, BSONWritable> getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    if (split instanceof BSONFileSplit || !isSplitable(fs, fileSplit.getPath())) {
        BSONFileRecordReader reader = new BSONFileRecordReader();
        reader.initialize(split, job);
        return reader;
    }
    // Split was not created by BSONSplitter.
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    splitter.setInputPath(fileSplit.getPath());
    org.apache.hadoop.mapreduce.lib.input.FileSplit newStyleFileSplit = new org.apache.hadoop.mapreduce.lib.input.FileSplit(fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength(), fileSplit.getLocations());
    long start = splitter.getStartingPositionForSplit(newStyleFileSplit);
    BSONFileRecordReader reader = new BSONFileRecordReader(start);
    reader.initialize(fileSplit, job);
    return reader;
}
Also used : BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) FileSplit(org.apache.hadoop.mapred.FileSplit) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) BSONFileRecordReader(com.mongodb.hadoop.mapred.input.BSONFileRecordReader) FileSystem(org.apache.hadoop.fs.FileSystem)
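
The hand-rolled conversion above exists because BSONSplitter.getStartingPositionForSplit expects a new-API (org.apache.hadoop.mapreduce) split, while this input format receives old-API (org.apache.hadoop.mapred) splits. A minimal sketch of just that bridge, with a hypothetical helper name (toNewApi is not a mongo-hadoop method):

public static org.apache.hadoop.mapreduce.lib.input.FileSplit toNewApi(
        final org.apache.hadoop.mapred.FileSplit oldSplit) throws java.io.IOException {
    // Copy the file path, starting byte offset, byte length, and
    // preferred host locations into the new-API equivalent.
    return new org.apache.hadoop.mapreduce.lib.input.FileSplit(
            oldSplit.getPath(),
            oldSplit.getStart(),
            oldSplit.getLength(),
            oldSplit.getLocations());
}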

Example 47 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.

From the class BSONFileInputFormatTest, method enronEmails:

@Test
public void enronEmails() throws IOException {
    BSONFileInputFormat inputFormat = new BSONFileInputFormat();
    JobConf job = new JobConf();
    String inputDirectory = new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson").getAbsoluteFile().toURI().toString();
    // Hadoop 2.X
    job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory);
    // Hadoop 1.2.X
    job.set("mapred.input.dir", inputDirectory);
    FileSplit[] splits = inputFormat.getSplits(job, 5);
    int count = 0;
    BSONWritable writable = new BSONWritable();
    for (FileSplit split : splits) {
        RecordReader<NullWritable, BSONWritable> recordReader = inputFormat.getRecordReader(split, job, null);
        while (recordReader.next(null, writable)) {
            count++;
        }
    }
    assertEquals("There are 501513 messages in the enron corpus", 501513, count);
}
Also used : BSONWritable(com.mongodb.hadoop.io.BSONWritable) BSONFileInputFormat(com.mongodb.hadoop.mapred.BSONFileInputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) JobConf(org.apache.hadoop.mapred.JobConf) File(java.io.File) NullWritable(org.apache.hadoop.io.NullWritable) Test(org.junit.Test)
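
The test writes the input path under both configuration keys so the same JobConf works on Hadoop 1.x and 2.x, whose key names differ. A hypothetical helper capturing that pattern (setInputDir is our name, not a Hadoop API):

import org.apache.hadoop.mapred.JobConf;

public final class InputDirCompat {
    // Sets the input directory under both the Hadoop 2.x key and the
    // legacy Hadoop 1.x key, mirroring the test above.
    public static void setInputDir(final JobConf job, final String inputDirectory) {
        job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory); // Hadoop 2.x
        job.set("mapred.input.dir", inputDirectory); // Hadoop 1.x
    }
}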

Example 49 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hadoop by apache.

From the class AutoInputFormat, method getRecordReader:

public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    FSDataInputStream is = fs.open(fileSplit.getPath());
    byte[] header = new byte[3];
    RecordReader reader = null;
    try {
        is.readFully(header);
    } catch (EOFException eof) {
        reader = textInputFormat.getRecordReader(split, job, reporter);
    } finally {
        is.close();
    }
    // Only sniff the header if the EOF fallback above did not already pick
    // a reader; otherwise the fallback reader would be discarded unclosed.
    if (reader == null) {
        if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
            // "SEQ" magic bytes: treat the file as a SequenceFile.
            reader = seqFileInputFormat.getRecordReader(split, job, reporter);
        } else {
            reader = textInputFormat.getRecordReader(split, job, reporter);
        }
    }
    return reader;
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) RecordReader(org.apache.hadoop.mapred.RecordReader) EOFException(java.io.EOFException) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileSplit(org.apache.hadoop.mapred.FileSplit)
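
The detection logic reduces to checking the three-byte SequenceFile magic "SEQ" at the start of the file. A self-contained sketch of that check (FormatSniffer and looksLikeSequenceFile are hypothetical names, not Hadoop APIs):

import java.io.EOFException;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class FormatSniffer {
    // Returns true when the file starts with the SequenceFile magic bytes
    // "SEQ"; false otherwise, including files shorter than three bytes.
    public static boolean looksLikeSequenceFile(final Path path, final Configuration conf) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            byte[] header = new byte[3];
            in.readFully(header);
            return header[0] == 'S' && header[1] == 'E' && header[2] == 'Q';
        } catch (EOFException tooShort) {
            return false;
        }
    }
}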

Example 50 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.

From the class AbstractTestHiveFileFormats, method createTestFile:

public static FileSplit createTestFile(String filePath, HiveStorageFormat storageFormat, HiveCompressionCodec compressionCodec, List<TestColumn> testColumns, ConnectorSession session, int numRows, HiveFileWriterFactory fileWriterFactory) throws Exception {
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    List<Type> types = testColumns.stream().map(TestColumn::getType).map(HiveType::valueOf).map(type -> type.getType(TYPE_MANAGER)).collect(toList());
    PageBuilder pageBuilder = new PageBuilder(types);
    for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
        pageBuilder.declarePosition();
        for (int columnNumber = 0; columnNumber < testColumns.size(); columnNumber++) {
            serializeObject(types.get(columnNumber), pageBuilder.getBlockBuilder(columnNumber), testColumns.get(columnNumber).getWriteValue(), testColumns.get(columnNumber).getObjectInspector(), false);
        }
    }
    Page page = pageBuilder.build();
    JobConf jobConf = new JobConf();
    configureCompression(jobConf, compressionCodec);
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    Optional<HiveFileWriter> fileWriter = fileWriterFactory.createFileWriter(new Path(filePath), testColumns.stream().map(TestColumn::getName).collect(toList()), StorageFormat.fromHiveStorageFormat(storageFormat), tableProperties, jobConf, session);
    HiveFileWriter hiveFileWriter = fileWriter.orElseThrow(() -> new IllegalArgumentException("fileWriterFactory"));
    hiveFileWriter.appendRows(page);
    hiveFileWriter.commit();
    return new FileSplit(new Path(filePath), 0, new File(filePath).length(), new String[0]);
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) TypeManager(com.facebook.presto.spi.type.TypeManager) JavaHiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector) HIVE_DEFAULT_DYNAMIC_PARTITION(com.facebook.presto.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION) TypeRegistry(com.facebook.presto.type.TypeRegistry) PrimitiveObjectInspectorFactory.javaByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector) Text(org.apache.hadoop.io.Text) PrimitiveObjectInspectorFactory.javaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector) SqlDate(com.facebook.presto.spi.type.SqlDate) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) PrimitiveObjectInspectorFactory.javaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector) RowType(com.facebook.presto.type.RowType) PrimitiveObjectInspectorFactory.javaDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector) BIGINT(com.facebook.presto.spi.type.BigintType.BIGINT) CharType.createCharType(com.facebook.presto.spi.type.CharType.createCharType) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) BigDecimal(java.math.BigDecimal) FileSplit(org.apache.hadoop.mapred.FileSplit) BOOLEAN(com.facebook.presto.spi.type.BooleanType.BOOLEAN) Predicates.not(com.google.common.base.Predicates.not) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) BigInteger(java.math.BigInteger) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Assert.assertFalse(org.testng.Assert.assertFalse) PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector) Decimals(com.facebook.presto.spi.type.Decimals) TINYINT(com.facebook.presto.spi.type.TinyintType.TINYINT) PrimitiveObjectInspectorFactory.javaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector) PrimitiveObjectInspectorFactory.javaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector) StorageFormat(com.facebook.presto.hive.metastore.StorageFormat) StructuralTestUtil.rowBlockOf(com.facebook.presto.tests.StructuralTestUtil.rowBlockOf) StructuralTestUtil.decimalArrayBlockOf(com.facebook.presto.tests.StructuralTestUtil.decimalArrayBlockOf) PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector) ConnectorSession(com.facebook.presto.spi.ConnectorSession) Iterables.filter(com.google.common.collect.Iterables.filter) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) SqlDecimal(com.facebook.presto.spi.type.SqlDecimal) Joiner(com.google.common.base.Joiner) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) 
JavaHiveCharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveCharObjectInspector) DOUBLE(com.facebook.presto.spi.type.DoubleType.DOUBLE) Slice(io.airlift.slice.Slice) SliceOutput(io.airlift.slice.SliceOutput) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) HiveUtil.isStructuralType(com.facebook.presto.hive.HiveUtil.isStructuralType) MapType(com.facebook.presto.type.MapType) ArrayList(java.util.ArrayList) PrimitiveObjectInspectorFactory.javaShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector) Type(com.facebook.presto.spi.type.Type) Arrays.fill(java.util.Arrays.fill) Properties(java.util.Properties) IOException(java.io.IOException) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) UTC(org.joda.time.DateTimeZone.UTC) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) File(java.io.File) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) VarcharType.createUnboundedVarcharType(com.facebook.presto.spi.type.VarcharType.createUnboundedVarcharType) PageBuilder(com.facebook.presto.spi.PageBuilder) BlockSerdeUtil(com.facebook.presto.block.BlockSerdeUtil) SqlVarbinary(com.facebook.presto.spi.type.SqlVarbinary) SerDeUtils.serializeObject(com.facebook.presto.hive.util.SerDeUtils.serializeObject) Page(com.facebook.presto.spi.Page) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) Block(com.facebook.presto.spi.block.Block) Iterables.transform(com.google.common.collect.Iterables.transform) DateType(com.facebook.presto.spi.type.DateType) MaterializedResult.materializeSourceDataStream(com.facebook.presto.testing.MaterializedResult.materializeSourceDataStream) SqlTimestamp(com.facebook.presto.spi.type.SqlTimestamp) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DecimalType(com.facebook.presto.spi.type.DecimalType) TimestampType(com.facebook.presto.spi.type.TimestampType) Path(org.apache.hadoop.fs.Path) DateTimeFormat(org.joda.time.format.DateTimeFormat) ImmutableMap(com.google.common.collect.ImmutableMap) Timestamp(java.sql.Timestamp) HdfsConfigurationUpdater.configureCompression(com.facebook.presto.hive.HdfsConfigurationUpdater.configureCompression) BlockBuilder(com.facebook.presto.spi.block.BlockBuilder) HiveOutputFormat(org.apache.hadoop.hive.ql.io.HiveOutputFormat) SerDe(org.apache.hadoop.hive.serde2.SerDe) RecordCursor(com.facebook.presto.spi.RecordCursor) List(java.util.List) StructuralTestUtil.arrayBlockOf(com.facebook.presto.tests.StructuralTestUtil.arrayBlockOf) Optional(java.util.Optional) INTEGER(com.facebook.presto.spi.type.IntegerType.INTEGER) PARTITION_KEY(com.facebook.presto.hive.HiveColumnHandle.ColumnType.PARTITION_KEY) StructuralTestUtil.decimalMapBlockOf(com.facebook.presto.tests.StructuralTestUtil.decimalMapBlockOf) Varchars.isVarcharType(com.facebook.presto.spi.type.Varchars.isVarcharType) TypeInfoFactory.getCharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo) Strings.padEnd(com.google.common.base.Strings.padEnd) ArrayType(com.facebook.presto.type.ArrayType) PrimitiveObjectInspectorFactory.javaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector) Assert.assertEquals(org.testng.Assert.assertEquals) 
HashMap(java.util.HashMap) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) StructuralTestUtil.mapBlockOf(com.facebook.presto.tests.StructuralTestUtil.mapBlockOf) Category(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) ImmutableList(com.google.common.collect.ImmutableList) SESSION(com.facebook.presto.hive.HiveTestUtils.SESSION) StructuralTestUtil(com.facebook.presto.tests.StructuralTestUtil) Objects.requireNonNull(java.util.Objects.requireNonNull) BlockBuilderStatus(com.facebook.presto.spi.block.BlockBuilderStatus) VARBINARY(com.facebook.presto.spi.type.VarbinaryType.VARBINARY) ObjectInspectorFactory.getStandardMapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector) UTF_8(java.nio.charset.StandardCharsets.UTF_8) DateTime(org.joda.time.DateTime) SMALLINT(com.facebook.presto.spi.type.SmallintType.SMALLINT) ObjectInspectorFactory.getStandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector) Date(java.sql.Date) CharType(com.facebook.presto.spi.type.CharType) JobConf(org.apache.hadoop.mapred.JobConf) TimeUnit(java.util.concurrent.TimeUnit) MaterializedResult(com.facebook.presto.testing.MaterializedResult) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) REAL(com.facebook.presto.spi.type.RealType.REAL) MaterializedRow(com.facebook.presto.testing.MaterializedRow) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) Chars.isCharType(com.facebook.presto.spi.type.Chars.isCharType)
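
createTestFile ends by wrapping the freshly written file in a whole-file split: offset 0, length equal to the file size, and an empty host array since locality does not matter for a local test file. That last step in isolation, as a minimal sketch with a hypothetical file path:

import java.io.File;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public final class WholeFileSplitExample {
    public static void main(final String[] args) {
        File file = new File(args[0]); // any local data file
        // One split covering the entire file, with no locality hints.
        FileSplit split = new FileSplit(new Path(file.toURI()), 0, file.length(), new String[0]);
        System.out.println(split);
    }
}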

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 66
Path (org.apache.hadoop.fs.Path): 38
InputSplit (org.apache.hadoop.mapred.InputSplit): 23
JobConf (org.apache.hadoop.mapred.JobConf): 16
File (java.io.File): 10
IOException (java.io.IOException): 10
Configuration (org.apache.hadoop.conf.Configuration): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Test (org.junit.Test): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 8
ArrayList (java.util.ArrayList): 7
Properties (java.util.Properties): 7
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5
NullWritable (org.apache.hadoop.io.NullWritable): 5
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4
VertexLocationHint (org.apache.tez.dag.api.VertexLocationHint): 4