Example 16 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.

From class AbstractTestHiveFileFormats, method createTestFile:

public static FileSplit createTestFile(String filePath, HiveStorageFormat storageFormat, HiveCompressionCodec compressionCodec, List<TestColumn> testColumns, int numRows) throws Exception {
    HiveOutputFormat<?, ?> outputFormat = newInstance(storageFormat.getOutputFormat(), HiveOutputFormat.class);
    @SuppressWarnings("deprecation") SerDe serDe = newInstance(storageFormat.getSerDe(), SerDe.class);
    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));
    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serDe.initialize(new Configuration(), tableProperties);
    JobConf jobConf = new JobConf();
    configureCompression(jobConf, compressionCodec);
    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(jobConf, new Path(filePath), Text.class, compressionCodec != HiveCompressionCodec.NONE, tableProperties, () -> {
    });
    try {
        // the SerDe was already initialized above; this second call is redundant but harmless
        serDe.initialize(new Configuration(), tableProperties);
        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serDe.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    } finally {
        recordWriter.close(false);
    }
    // TODO: to test with compression, the file must be renamed with the compression extension
    Path path = new Path(filePath);
    path.getFileSystem(new Configuration()).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
Also used : SerDe(org.apache.hadoop.hive.serde2.SerDe) Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Writable(org.apache.hadoop.io.Writable) Properties(java.util.Properties) FileSplit(org.apache.hadoop.mapred.FileSplit) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) Slice(io.airlift.slice.Slice) SerDeUtils.serializeObject(com.facebook.presto.hive.util.SerDeUtils.serializeObject) JobConf(org.apache.hadoop.mapred.JobConf) File(java.io.File)
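
A minimal usage sketch (hypothetical: the temp-file path and storage format are placeholders, and testColumns stands for a List<TestColumn> built by the surrounding test class):

// Hypothetical usage of createTestFile: write 1000 rows, then get a split covering the whole file.
File targetFile = File.createTempFile("presto-format-test", ".data");
targetFile.delete(); // the Hive record writer creates the file itself
FileSplit split = createTestFile(
        targetFile.getAbsolutePath(),
        HiveStorageFormat.RCBINARY,
        // NONE sidesteps the rename-with-extension caveat noted in the TODO above
        HiveCompressionCodec.NONE,
        testColumns,
        1000);
assertEquals(split.getStart(), 0);
assertEquals(split.getLength(), targetFile.length());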

Example 17 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.

From class TestHiveFileFormats, method testCursorProvider:

private void testCursorProvider(HiveRecordCursorProvider cursorProvider, FileSplit split, HiveStorageFormat storageFormat, List<TestColumn> testColumns, int rowCount) throws IOException {
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerDe());
    splitProperties.setProperty("columns", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getName)));
    splitProperties.setProperty("columns.types", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getType)));
    List<HivePartitionKey> partitionKeys = testColumns.stream().filter(TestColumn::isPartitionKey).map(input -> new HivePartitionKey(input.getName(), HiveType.valueOf(input.getObjectInspector().getTypeName()), (String) input.getWriteValue())).collect(toList());
    Optional<ConnectorPageSource> pageSource = HivePageSourceProvider.createHivePageSource(ImmutableSet.of(cursorProvider), ImmutableSet.of(), "test", new Configuration(), SESSION, split.getPath(), OptionalInt.empty(), split.getStart(), split.getLength(), splitProperties, TupleDomain.all(), getColumnHandles(testColumns), partitionKeys, DateTimeZone.getDefault(), TYPE_MANAGER, ImmutableMap.of());
    RecordCursor cursor = ((RecordPageSource) pageSource.get()).getCursor();
    checkCursor(cursor, testColumns, rowCount);
}
Also used : RecordPageSource(com.facebook.presto.spi.RecordPageSource) DateTimeZone(org.joda.time.DateTimeZone) ORC(com.facebook.presto.hive.HiveStorageFormat.ORC) Iterables.transform(com.google.common.collect.Iterables.transform) OrcPageSourceFactory(com.facebook.presto.hive.orc.OrcPageSourceFactory) Test(org.testng.annotations.Test) RowType(com.facebook.presto.type.RowType) FileSplit(org.apache.hadoop.mapred.FileSplit) Predicates.not(com.google.common.base.Predicates.not) Configuration(org.apache.hadoop.conf.Configuration) AVRO(com.facebook.presto.hive.HiveStorageFormat.AVRO) Path(org.apache.hadoop.fs.Path) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) ImmutableCollectors.toImmutableList(com.facebook.presto.util.ImmutableCollectors.toImmutableList) TEXTFILE(com.facebook.presto.hive.HiveStorageFormat.TEXTFILE) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) TimeZone(java.util.TimeZone) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) BeforeClass(org.testng.annotations.BeforeClass) DWRF(com.facebook.presto.hive.HiveStorageFormat.DWRF) StructuralTestUtil.rowBlockOf(com.facebook.presto.tests.StructuralTestUtil.rowBlockOf) Assert.assertNotNull(org.testng.Assert.assertNotNull) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Collectors(java.util.stream.Collectors) ConnectorSession(com.facebook.presto.spi.ConnectorSession) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) RecordCursor(com.facebook.presto.spi.RecordCursor) List(java.util.List) StructuralTestUtil.arrayBlockOf(com.facebook.presto.tests.StructuralTestUtil.arrayBlockOf) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) TYPE_MANAGER(com.facebook.presto.hive.HiveTestUtils.TYPE_MANAGER) Optional(java.util.Optional) INTEGER(com.facebook.presto.spi.type.IntegerType.INTEGER) Iterables.filter(com.google.common.collect.Iterables.filter) PrimitiveObjectInspectorFactory.javaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector) Joiner(com.google.common.base.Joiner) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) DataProvider(org.testng.annotations.DataProvider) PrimitiveCategory(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory) ArrayType(com.facebook.presto.type.ArrayType) HiveTestUtils.getTypes(com.facebook.presto.hive.HiveTestUtils.getTypes) RcFilePageSourceFactory(com.facebook.presto.hive.rcfile.RcFilePageSourceFactory) Assert.assertEquals(org.testng.Assert.assertEquals) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector) PARQUET(com.facebook.presto.hive.HiveStorageFormat.PARQUET) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) Lists(com.google.common.collect.Lists) ImmutableList(com.google.common.collect.ImmutableList) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) 
SESSION(com.facebook.presto.hive.HiveTestUtils.SESSION) DwrfPageSourceFactory(com.facebook.presto.hive.orc.DwrfPageSourceFactory) RCTEXT(com.facebook.presto.hive.HiveStorageFormat.RCTEXT) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetRecordCursorProvider(com.facebook.presto.hive.parquet.ParquetRecordCursorProvider) JSON(com.facebook.presto.hive.HiveStorageFormat.JSON) SEQUENCEFILE(com.facebook.presto.hive.HiveStorageFormat.SEQUENCEFILE) Properties(java.util.Properties) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) ObjectInspectorFactory.getStandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector) TestingConnectorSession(com.facebook.presto.testing.TestingConnectorSession) ObjectInspectorFactory.getStandardListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector) File(java.io.File) RCBINARY(com.facebook.presto.hive.HiveStorageFormat.RCBINARY) VarcharType.createUnboundedVarcharType(com.facebook.presto.spi.type.VarcharType.createUnboundedVarcharType) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) HDFS_ENVIRONMENT(com.facebook.presto.hive.HiveTestUtils.HDFS_ENVIRONMENT) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) Assert.assertTrue(org.testng.Assert.assertTrue) PrimitiveObjectInspectorFactory.javaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector) ParquetPageSourceFactory(com.facebook.presto.hive.parquet.ParquetPageSourceFactory)
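
An illustrative driver for this helper (a sketch, not verbatim from the test class; the cursor provider and storage format are assumptions):

// Sketch: write a file with createTestFile, then check that a cursor provider reads it back intact.
int rowCount = 1000;
File file = File.createTempFile("cursor-test", ".seq");
file.delete();
FileSplit split = createTestFile(file.getAbsolutePath(), SEQUENCEFILE, HiveCompressionCodec.NONE, testColumns, rowCount);
try {
    testCursorProvider(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), split, SEQUENCEFILE, testColumns, rowCount);
}
finally {
    file.delete();
}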

Example 18 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.

From class TestHiveFileFormats, method testParquetThrift:

@Test(dataProvider = "rowCount")
public void testParquetThrift(int rowCount) throws Exception {
    RowType nameType = new RowType(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), Optional.empty());
    RowType phoneType = new RowType(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), Optional.empty());
    RowType personType = new RowType(ImmutableList.of(nameType, INTEGER, createUnboundedVarcharType(), new ArrayType(phoneType)), Optional.empty());
    List<TestColumn> testColumns = ImmutableList.of(new TestColumn("persons", getStandardListObjectInspector(getStandardStructObjectInspector(ImmutableList.of("name", "id", "email", "phones"), ImmutableList.of(getStandardStructObjectInspector(ImmutableList.of("first_name", "last_name"), ImmutableList.of(javaStringObjectInspector, javaStringObjectInspector)), javaIntObjectInspector, javaStringObjectInspector, getStandardListObjectInspector(getStandardStructObjectInspector(ImmutableList.of("number", "type"), ImmutableList.of(javaStringObjectInspector, javaStringObjectInspector)))))), null, arrayBlockOf(personType, rowBlockOf(ImmutableList.of(nameType, INTEGER, createUnboundedVarcharType(), new ArrayType(phoneType)), rowBlockOf(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), "Bob", "Roberts"), 0, "bob.roberts@example.com", arrayBlockOf(phoneType, rowBlockOf(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), "1234567890", null))))));
    File file = new File(this.getClass().getClassLoader().getResource("addressbook.parquet").getPath());
    FileSplit split = new FileSplit(new Path(file.getAbsolutePath()), 0, file.length(), new String[0]);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false, HDFS_ENVIRONMENT);
    testCursorProvider(cursorProvider, split, PARQUET, testColumns, 1);
}
Also used : ArrayType(com.facebook.presto.type.ArrayType) Path(org.apache.hadoop.fs.Path) ParquetRecordCursorProvider(com.facebook.presto.hive.parquet.ParquetRecordCursorProvider) RowType(com.facebook.presto.type.RowType) FileSplit(org.apache.hadoop.mapred.FileSplit) File(java.io.File) Test(org.testng.annotations.Test)
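
The rowCount argument comes from a TestNG data provider named "rowCount"; a hedged sketch of what such a provider looks like (the actual values used by TestHiveFileFormats may differ):

@DataProvider(name = "rowCount")
public static Object[][] rowCountProvider()
{
    // hypothetical values; the real provider may supply different counts
    return new Object[][] {{0}, {1000}};
}

Note that testParquetThrift reads the checked-in addressbook.parquet resource rather than generating data, which is why it passes a fixed row count of 1 to testCursorProvider regardless of the provider's value.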

Example 19 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project hadoop-pcap by RIPE-NCC.

From class PcapInputFormat, method getRecordReader:

@Override
public RecordReader<LongWritable, ObjectWritable> getRecordReader(InputSplit split, JobConf config, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    LOG.info("Reading PCAP: " + path.toString());
    long start = 0L;
    long length = fileSplit.getLength();
    return initPcapRecordReader(path, start, length, reporter, config);
}
Also used : Path(org.apache.hadoop.fs.Path) FileSplit(org.apache.hadoop.mapred.FileSplit)
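
A sketch of driving this input format end to end through the stock org.apache.hadoop.mapred API (the input path is a placeholder; only standard Hadoop calls plus PcapInputFormat itself are used):

JobConf conf = new JobConf();
FileInputFormat.setInputPaths(conf, new Path("/data/capture.pcap")); // placeholder path
PcapInputFormat inputFormat = new PcapInputFormat();
for (InputSplit split : inputFormat.getSplits(conf, 1)) {
    RecordReader<LongWritable, ObjectWritable> reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
    LongWritable key = reader.createKey();
    ObjectWritable value = reader.createValue();
    while (reader.next(key, value)) {
        // each value wraps one decoded packet from the capture
    }
    reader.close();
}

Note that getRecordReader above starts every reader at offset 0 rather than at fileSplit.getStart(), presumably because a PCAP stream must be decoded from its global file header.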

Example 20 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.

From class HDFSInputStream, method nextInputSplit:

private boolean nextInputSplit() throws IOException {
    for (; currentSplitIndex < inputSplits.length; currentSplitIndex++) {
        /*
         * Read all the partitions scheduled to the current node.
         */
        if (readSchedule[currentSplitIndex].equals(nodeName)) {
            /*
             * Pick an unread split to read; synchronize among
             * simultaneous partitions on the same machine.
             */
            synchronized (read) {
                if (read[currentSplitIndex]) {
                    // another partition on this node already claimed this split
                    continue;
                }
                read[currentSplitIndex] = true;
            }
            if (snapshot != null) {
                String fileName = ((FileSplit) (inputSplits[currentSplitIndex])).getPath().toUri().getPath();
                FileStatus fileStatus = hdfs.getFileStatus(new Path(fileName));
                // Skip if not the same file stored in the files snapshot
                if (fileStatus.getModificationTime() != snapshot.get(currentSplitIndex).getLastModefiedTime().getTime()) {
                    continue;
                }
            }
            reader.close();
            reader = getRecordReader(currentSplitIndex);
            return true;
        }
    }
    return false;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) FileSplit(org.apache.hadoop.mapred.FileSplit)
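
The synchronized block implements a first-wins claim: every partition thread on a node scans the shared schedule, and only one caller gets to mark a given split as read. A hypothetical helper isolating that step:

// Hypothetical refactoring of the claim step from nextInputSplit():
// returns true only for the single caller that wins the race for split i.
private boolean claimSplit(boolean[] read, int i)
{
    synchronized (read) {
        if (read[i]) {
            return false; // a sibling partition on this node already took it
        }
        read[i] = true;
        return true;
    }
}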

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 61
Path (org.apache.hadoop.fs.Path): 36
InputSplit (org.apache.hadoop.mapred.InputSplit): 21
JobConf (org.apache.hadoop.mapred.JobConf): 15
File (java.io.File): 10
IOException (java.io.IOException): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Configuration (org.apache.hadoop.conf.Configuration): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 8
ArrayList (java.util.ArrayList): 7
Properties (java.util.Properties): 7
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 7
Test (org.junit.Test): 7
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5
NullWritable (org.apache.hadoop.io.NullWritable): 5
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 4
ClusterTopology (org.apache.hyracks.api.topology.ClusterTopology): 4
RecordCursor (com.facebook.presto.spi.RecordCursor): 3
INTEGER (com.facebook.presto.spi.type.IntegerType.INTEGER): 3