use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
In class AbstractTestHiveFileFormats, the method createTestFile:
public static FileSplit createTestFile(String filePath, HiveStorageFormat storageFormat, HiveCompressionCodec compressionCodec, List<TestColumn> testColumns, int numRows)
        throws Exception
{
    HiveOutputFormat<?, ?> outputFormat = newInstance(storageFormat.getOutputFormat(), HiveOutputFormat.class);
    @SuppressWarnings("deprecation")
    SerDe serDe = newInstance(storageFormat.getSerDe(), SerDe.class);

    // filter out partition keys, which are not written to the file
    testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey)));

    Properties tableProperties = new Properties();
    tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
    tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
    serDe.initialize(new Configuration(), tableProperties);

    JobConf jobConf = new JobConf();
    configureCompression(jobConf, compressionCodec);

    RecordWriter recordWriter = outputFormat.getHiveRecordWriter(
            jobConf,
            new Path(filePath),
            Text.class,
            compressionCodec != HiveCompressionCodec.NONE,
            tableProperties,
            () -> { });

    try {
        serDe.initialize(new Configuration(), tableProperties);

        SettableStructObjectInspector objectInspector = getStandardStructObjectInspector(
                ImmutableList.copyOf(transform(testColumns, TestColumn::getName)),
                ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector)));

        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        for (int rowNumber = 0; rowNumber < numRows; rowNumber++) {
            for (int i = 0; i < testColumns.size(); i++) {
                Object writeValue = testColumns.get(i).getWriteValue();
                if (writeValue instanceof Slice) {
                    writeValue = ((Slice) writeValue).getBytes();
                }
                objectInspector.setStructFieldData(row, fields.get(i), writeValue);
            }
            Writable record = serDe.serialize(row, objectInspector);
            recordWriter.write(record);
        }
    }
    finally {
        recordWriter.close(false);
    }

    // todo to test with compression, the file must be renamed with the compression extension
    Path path = new Path(filePath);
    path.getFileSystem(new Configuration()).setVerifyChecksum(true);
    File file = new File(filePath);
    return new FileSplit(path, 0, file.length(), new String[0]);
}
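The FileSplit returned here is only a descriptor: it records the path of the freshly written file, a byte range starting at offset 0, and an empty array of preferred hosts. A minimal, self-contained sketch of constructing and inspecting such a whole-file split outside the test harness (the file path below is hypothetical):

import java.io.File;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public final class FileSplitSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical local file standing in for the one written by the Hive RecordWriter above.
        File file = new File("/tmp/hive_format_test.rc");

        // Whole-file split with no location hints, the same shape createTestFile returns.
        FileSplit split = new FileSplit(new Path(file.getAbsolutePath()), 0, file.length(), new String[0]);

        System.out.println(split.getPath());             // path of the file backing the split
        System.out.println(split.getStart());            // 0: the split starts at the beginning of the file
        System.out.println(split.getLength());           // length of the split in bytes
        System.out.println(split.getLocations().length); // 0: no preferred hosts were supplied
    }
}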
use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
In class TestHiveFileFormats, the method testCursorProvider:
private void testCursorProvider(HiveRecordCursorProvider cursorProvider, FileSplit split, HiveStorageFormat storageFormat, List<TestColumn> testColumns, int rowCount)
        throws IOException
{
    // describe the split the same way a real Hive split's schema would be described
    Properties splitProperties = new Properties();
    splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat());
    splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerDe());
    splitProperties.setProperty("columns", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getName)));
    splitProperties.setProperty("columns.types", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getType)));

    // partition key columns are supplied as HivePartitionKey values rather than read from the file
    List<HivePartitionKey> partitionKeys = testColumns.stream()
            .filter(TestColumn::isPartitionKey)
            .map(input -> new HivePartitionKey(input.getName(), HiveType.valueOf(input.getObjectInspector().getTypeName()), (String) input.getWriteValue()))
            .collect(toList());

    Optional<ConnectorPageSource> pageSource = HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(cursorProvider), ImmutableSet.of(), "test", new Configuration(), SESSION,
            split.getPath(), OptionalInt.empty(), split.getStart(), split.getLength(),
            splitProperties, TupleDomain.all(), getColumnHandles(testColumns), partitionKeys,
            DateTimeZone.getDefault(), TYPE_MANAGER, ImmutableMap.of());

    RecordCursor cursor = ((RecordPageSource) pageSource.get()).getCursor();
    checkCursor(cursor, testColumns, rowCount);
}
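For orientation, checkCursor walks the cursor row by row and compares each column against the TestColumn's expected value. A rough, hedged sketch of that consumption pattern, assuming only that column 0 is a long-backed column; the real helper dispatches on every column's Presto type:

import com.facebook.presto.spi.RecordCursor;

// Sketch only: drain a cursor and count its rows, peeking at column 0 when it is non-null.
static long drainCursor(RecordCursor cursor) {
    long rows = 0;
    try {
        while (cursor.advanceNextPosition()) {
            if (!cursor.isNull(0)) {
                long value = cursor.getLong(0); // assumes column 0 is long-backed; the real test asserts on the value
            }
            rows++;
        }
    }
    finally {
        cursor.close();
    }
    return rows;
}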
use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
In class TestHiveFileFormats, the method testParquetThrift:
@Test(dataProvider = "rowCount")
public void testParquetThrift(int rowCount)
        throws Exception
{
    RowType nameType = new RowType(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), Optional.empty());
    RowType phoneType = new RowType(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), Optional.empty());
    RowType personType = new RowType(ImmutableList.of(nameType, INTEGER, createUnboundedVarcharType(), new ArrayType(phoneType)), Optional.empty());

    List<TestColumn> testColumns = ImmutableList.of(new TestColumn(
            "persons",
            getStandardListObjectInspector(getStandardStructObjectInspector(
                    ImmutableList.of("name", "id", "email", "phones"),
                    ImmutableList.of(
                            getStandardStructObjectInspector(ImmutableList.of("first_name", "last_name"), ImmutableList.of(javaStringObjectInspector, javaStringObjectInspector)),
                            javaIntObjectInspector,
                            javaStringObjectInspector,
                            getStandardListObjectInspector(getStandardStructObjectInspector(ImmutableList.of("number", "type"), ImmutableList.of(javaStringObjectInspector, javaStringObjectInspector)))))),
            null,
            arrayBlockOf(personType, rowBlockOf(
                    ImmutableList.of(nameType, INTEGER, createUnboundedVarcharType(), new ArrayType(phoneType)),
                    rowBlockOf(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), "Bob", "Roberts"),
                    0,
                    "bob.roberts@example.com",
                    arrayBlockOf(phoneType, rowBlockOf(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), "1234567890", null))))));

    File file = new File(this.getClass().getClassLoader().getResource("addressbook.parquet").getPath());
    FileSplit split = new FileSplit(new Path(file.getAbsolutePath()), 0, file.length(), new String[0]);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false, HDFS_ENVIRONMENT);
    testCursorProvider(cursorProvider, split, PARQUET, testColumns, 1);
}
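The pattern of wrapping a bundled test resource in a whole-file split can be factored into a small helper. A sketch under the assumption that the resource is on the test classpath; the helper name and resource lookup are illustrative, not part of the Presto test:

import java.io.File;
import java.net.URL;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

// Sketch: build a FileSplit covering an entire classpath resource.
static FileSplit splitForResource(String resourceName) {
    URL resource = Thread.currentThread().getContextClassLoader().getResource(resourceName);
    if (resource == null) {
        throw new IllegalArgumentException("test resource not found: " + resourceName);
    }
    File file = new File(resource.getPath());
    return new FileSplit(new Path(file.getAbsolutePath()), 0, file.length(), new String[0]);
}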
use of org.apache.hadoop.mapred.FileSplit in project hadoop-pcap by RIPE-NCC.
In class PcapInputFormat, the method getRecordReader:
@Override
public RecordReader<LongWritable, ObjectWritable> getRecordReader(InputSplit split, JobConf config, Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    LOG.info("Reading PCAP: " + path.toString());
    long start = 0L;
    long length = fileSplit.getLength();
    return initPcapRecordReader(path, start, length, reporter, config);
}
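Once the framework has this reader, it drives it with the classic old-API loop: createKey/createValue once, then next(key, value) until it returns false. A hedged sketch of that loop, assuming a configured PcapInputFormat instance; the pcap path and split length are hypothetical:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Sketch: consume the reader the way the old mapred runtime would.
static long countRecords(PcapInputFormat inputFormat, JobConf config) throws Exception {
    FileSplit split = new FileSplit(new Path("/tmp/dump.pcap"), 0, 1_000_000L, new String[0]);
    RecordReader<LongWritable, ObjectWritable> reader = inputFormat.getRecordReader(split, config, Reporter.NULL);

    LongWritable key = reader.createKey();
    ObjectWritable value = reader.createValue();
    long records = 0;
    try {
        while (reader.next(key, value)) {
            records++; // value wraps whatever the pcap reader decoded for this position
        }
    }
    finally {
        reader.close();
    }
    return records;
}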
use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
In class HDFSInputStream, the method nextInputSplit:
private boolean nextInputSplit() throws IOException {
    for (; currentSplitIndex < inputSplits.length; currentSplitIndex++) {
        /**
         * read all the partitions scheduled to the current node
         */
        if (readSchedule[currentSplitIndex].equals(nodeName)) {
            /**
             * pick an unread split to read; synchronize among
             * simultaneous partitions on the same machine
             */
            synchronized (read) {
                if (read[currentSplitIndex] == false) {
                    read[currentSplitIndex] = true;
                } else {
                    continue;
                }
            }
            if (snapshot != null) {
                String fileName = ((FileSplit) (inputSplits[currentSplitIndex])).getPath().toUri().getPath();
                FileStatus fileStatus = hdfs.getFileStatus(new Path(fileName));
                // Skip if not the same file stored in the files snapshot
                if (fileStatus.getModificationTime() != snapshot.get(currentSplitIndex).getLastModefiedTime().getTime()) {
                    continue;
                }
            }
            reader.close();
            reader = getRecordReader(currentSplitIndex);
            return true;
        }
    }
    return false;
}
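The FileSplit-specific piece here is the snapshot check: a split only carries a path and a byte range, so the code confirms the file's freshness against HDFS metadata before reopening a reader on it. A stand-alone sketch of that comparison (the file system handle and expected timestamp are supplied by the caller in this sketch):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

// Sketch: check whether a split still points at the same file version that a snapshot recorded,
// mirroring the modification-time comparison in nextInputSplit.
static boolean splitMatchesSnapshot(FileSystem hdfs, FileSplit split, long snapshotModificationTime) throws IOException {
    String fileName = split.getPath().toUri().getPath();
    FileStatus fileStatus = hdfs.getFileStatus(new Path(fileName));
    return fileStatus.getModificationTime() == snapshotModificationTime;
}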