Use of org.apache.hadoop.mapred.FileSplit in project presto by prestodb.
In class HiveUtil, method createRecordReader:
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));

    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    setReadColumns(configuration, readHiveColumnIndexes);

    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = new JobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);

    // propagate serialization configuration to getRecordReader
    schema.stringPropertyNames().stream()
            .filter(name -> name.startsWith("serialization."))
            .forEach(name -> jobConf.set(name, schema.getProperty(name)));

    try {
        return retry()
                .stopOnIllegalExceptions()
                .run("createRecordReader", () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
    } catch (Exception e) {
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
                format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                        path, start, length, getInputFormatName(schema), e.getMessage()),
                e);
    }
}
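The Presto helper above adds column pruning, property propagation, and retries around a plain mapred call: construct a FileSplit for the byte range and hand it to InputFormat.getRecordReader. Below is a minimal standalone sketch of just that call using Hadoop's built-in TextInputFormat; the file path and byte range are hypothetical, and none of the Presto-specific logic is reproduced.

// Sketch only: open an old-API (mapred) record reader over an explicit byte range of one file.
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class FileSplitReaderSketch {
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();
        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(jobConf);
        // read the first 1 MB of a hypothetical file; null hosts means no locality hints
        FileSplit split = new FileSplit(new Path("/tmp/data.txt"), 0, 1024 * 1024, (String[]) null);
        RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(value);
        }
        reader.close();
    }
}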
Use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
In class HDFSRecordReader, method nextInputSplit:
private boolean nextInputSplit() throws IOException {
    for (; currentSplitIndex < inputSplits.length; currentSplitIndex++) {
        /**
         * read all the partitions scheduled to the current node
         */
        if (readSchedule[currentSplitIndex].equals(nodeName)) {
            /**
             * pick an unread split to read; synchronize among
             * simultaneous partitions on the same machine
             */
            synchronized (read) {
                if (read[currentSplitIndex] == false) {
                    read[currentSplitIndex] = true;
                } else {
                    continue;
                }
            }
            if (snapshot != null) {
                String fileName = ((FileSplit) (inputSplits[currentSplitIndex])).getPath().toUri().getPath();
                FileStatus fileStatus = hdfs.getFileStatus(new Path(fileName));
                // Skip if not the same file stored in the files snapshot
                if (fileStatus.getModificationTime() != snapshot.get(currentSplitIndex).getLastModefiedTime().getTime()) {
                    continue;
                }
            }
            reader.close();
            reader = getRecordReader(currentSplitIndex);
            return true;
        }
    }
    return false;
}
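The snapshot guard above only needs the split's path: it looks up the file's current FileStatus in HDFS and compares modification times with the recorded snapshot entry. A minimal standalone sketch of that staleness check follows; the helper name and the snapshotModificationTime parameter are illustrative, not part of AsterixDB.

// Sketch only: check whether the file behind a FileSplit still matches a recorded snapshot time.
import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class SplitSnapshotCheck {
    // snapshotModificationTime would come from an external snapshot of the file's metadata
    static boolean splitMatchesSnapshot(FileSystem hdfs, FileSplit split, long snapshotModificationTime)
            throws IOException {
        FileStatus status = hdfs.getFileStatus(new Path(split.getPath().toUri().getPath()));
        // a different modification time means the file was replaced or appended after the snapshot
        return status.getModificationTime() == snapshotModificationTime;
    }
}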
Use of org.apache.hadoop.mapred.FileSplit in project asterixdb by apache.
In class HDFSUtils, method getSplits:
/**
 * Instead of creating the splits using the input format, we do it manually.
 * This function returns file splits (one per HDFS file block) irrespective of the number of partitions,
 * and the produced splits only cover the intersection between the current files in HDFS and the files
 * stored internally in AsterixDB:
 * 1. NoOp means an appended file
 * 2. AddOp means a new file
 * 3. UpdateOp means the delta of a file
 *
 * @return an array of file splits covering the external files
 * @throws IOException
 */
public static InputSplit[] getSplits(JobConf conf, List<ExternalFile> files) throws IOException {
    // Create file system object
    FileSystem fs = FileSystem.get(conf);
    ArrayList<FileSplit> fileSplits = new ArrayList<>();
    ArrayList<ExternalFile> orderedExternalFiles = new ArrayList<>();
    // Create file splits
    for (ExternalFile file : files) {
        Path filePath = new Path(file.getFileName());
        FileStatus fileStatus;
        try {
            fileStatus = fs.getFileStatus(filePath);
        } catch (FileNotFoundException e) {
            // file was deleted at some point, skip to next file
            continue;
        }
        if (file.getPendingOp() == ExternalFilePendingOp.ADD_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, file.getSize());
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() < file.getSize()) {
                    fileSplits.add(new FileSplit(filePath, block.getOffset(),
                            (block.getLength() + block.getOffset()) < file.getSize()
                                    ? block.getLength() : (file.getSize() - block.getOffset()),
                            block.getHosts()));
                    orderedExternalFiles.add(file);
                }
            }
        } else if (file.getPendingOp() == ExternalFilePendingOp.NO_OP
                && fileStatus.getModificationTime() == file.getLastModefiedTime().getTime()) {
            long oldSize = 0L;
            long newSize = file.getSize();
            for (int i = 0; i < files.size(); i++) {
                if (files.get(i).getFileName() == file.getFileName() && files.get(i).getSize() != file.getSize()) {
                    newSize = files.get(i).getSize();
                    oldSize = file.getSize();
                    break;
                }
            }
            // Get its information from HDFS name node
            BlockLocation[] fileBlocks = fs.getFileBlockLocations(fileStatus, 0, newSize);
            // Create a split per block
            for (BlockLocation block : fileBlocks) {
                if (block.getOffset() + block.getLength() > oldSize) {
                    if (block.getOffset() < newSize) {
                        // Block intersects the delta -> create a split
                        long startCut = (block.getOffset() > oldSize) ? 0L : oldSize - block.getOffset();
                        long endCut = (block.getOffset() + block.getLength() < newSize)
                                ? 0L : block.getOffset() + block.getLength() - newSize;
                        long splitLength = block.getLength() - startCut - endCut;
                        fileSplits.add(new FileSplit(filePath, block.getOffset() + startCut, splitLength,
                                block.getHosts()));
                        orderedExternalFiles.add(file);
                    }
                }
            }
        }
    }
    fs.close();
    files.clear();
    files.addAll(orderedExternalFiles);
    return fileSplits.toArray(new FileSplit[fileSplits.size()]);
}
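The startCut/endCut arithmetic above trims each HDFS block to the appended region between the previously known size (oldSize) and the new size (newSize). A small self-contained sketch of that trimming with hypothetical numbers makes the intent easier to verify; the class and method names below are illustrative only.

// Sketch only: trim a block (offset, length) to the delta region [oldSize, newSize).
public final class DeltaSplitSketch {
    static long[] trimToDelta(long offset, long length, long oldSize, long newSize) {
        // bytes of the block that lie before the old end of file
        long startCut = (offset > oldSize) ? 0L : oldSize - offset;
        // bytes of the block that lie beyond the new end of file
        long endCut = (offset + length < newSize) ? 0L : offset + length - newSize;
        // resulting split: starts just after the old end of file, covering only appended bytes
        return new long[] { offset + startCut, length - startCut - endCut };
    }

    public static void main(String[] args) {
        long mb = 1024L * 1024L;
        // a 128 MB block starting at offset 0; the file grew from 100 MB to 150 MB
        long[] split = trimToDelta(0, 128 * mb, 100 * mb, 150 * mb);
        // prints start=104857600 (100 MB) and length=29360128 (28 MB): only the appended bytes in this block
        System.out.println("start=" + split[0] + " length=" + split[1]);
    }
}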
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
In class ParquetRecordReaderBase, method getSplit:
/**
* Gets a ParquetInputSplit corresponding to a split given by Hive.
*
* @param oldSplit The split given by Hive
* @param conf The JobConf of the Hive job
* @return a ParquetInputSplit corresponding to the oldSplit
* @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
*/
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
    ParquetInputSplit split;
    if (oldSplit instanceof FileSplit) {
        final Path finalPath = ((FileSplit) oldSplit).getPath();
        jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
        // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
        //      MetadataFilter filter) API
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
        final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadSupport.ReadContext readContext =
                new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));

        // Compute stats
        for (BlockMetaData bmd : blocks) {
            serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
            serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
        }
        schemaSize = MessageTypeParser.parseMessageType(
                readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA))
                .getFieldCount();

        final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
        final long splitStart = ((FileSplit) oldSplit).getStart();
        final long splitLength = ((FileSplit) oldSplit).getLength();
        for (final BlockMetaData block : blocks) {
            final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        if (splitGroup.isEmpty()) {
            LOG.warn("Skipping split, could not find row group in: " + oldSplit);
            return null;
        }

        FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
        if (filter != null) {
            filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
            if (filtedBlocks.isEmpty()) {
                LOG.debug("All row groups are dropped due to filter predicates");
                return null;
            }
            long droppedBlocks = splitGroup.size() - filtedBlocks.size();
            if (droppedBlocks > 0) {
                LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
            }
        } else {
            filtedBlocks = splitGroup;
        }

        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
            skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
        }
        split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks,
                readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(),
                fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
        return split;
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
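The split-to-row-group matching above keeps a Parquet row group only when the offset of its first data page falls inside the byte range covered by the FileSplit, so each row group is read by exactly one split. A minimal standalone sketch of that containment test follows; the method and the array of offsets are illustrative, not part of Hive or Parquet.

// Sketch only: select row groups whose first data page starts inside the split's byte range.
import java.util.ArrayList;
import java.util.List;

public class RowGroupSelectionSketch {
    // firstPageOffsets holds one hypothetical first-data-page offset per row group
    static List<Integer> rowGroupsForSplit(long splitStart, long splitLength, long[] firstPageOffsets) {
        List<Integer> selected = new ArrayList<>();
        for (int i = 0; i < firstPageOffsets.length; i++) {
            long firstDataPage = firstPageOffsets[i];
            // same containment test used in ParquetRecordReaderBase.getSplit above
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                selected.add(i);
            }
        }
        return selected;
    }
}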
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
In class RCFileCat, method run:
@Override
public int run(String[] args) throws Exception {
    long start = 0L;
    long length = -1L;
    int recordCount = 0;
    long startT = System.currentTimeMillis();
    boolean verbose = false;
    boolean columnSizes = false;
    boolean pretty = false;
    boolean fileSizes = false;

    // get options from arguments
    if (args.length < 1 || args.length > 3) {
        printUsage(null);
        return -1;
    }
    Path fileName = null;
    for (int i = 0; i < args.length; i++) {
        String arg = args[i];
        if (arg.startsWith("--start=")) {
            start = Long.parseLong(arg.substring("--start=".length()));
        } else if (arg.startsWith("--length=")) {
            length = Long.parseLong(arg.substring("--length=".length()));
        } else if (arg.equals("--verbose")) {
            verbose = true;
        } else if (arg.equals("--column-sizes")) {
            columnSizes = true;
        } else if (arg.equals("--column-sizes-pretty")) {
            columnSizes = true;
            pretty = true;
        } else if (arg.equals("--file-sizes")) {
            fileSizes = true;
        } else if (fileName == null) {
            fileName = new Path(arg);
        } else {
            printUsage(null);
            return -1;
        }
    }

    setupBufferedOutput();
    FileSystem fs = FileSystem.get(fileName.toUri(), conf);
    long fileLen = fs.getFileStatus(fileName).getLen();
    if (start < 0) {
        start = 0;
    }
    if (start > fileLen) {
        return 0;
    }
    if (length < 0 || (start + length) > fileLen) {
        length = fileLen - start;
    }

    // share the code with RecordReader.
    FileSplit split = new FileSplit(fileName, start, length, new JobConf(conf));
    RCFileRecordReader recordReader = new RCFileRecordReader(conf, split);

    if (columnSizes || fileSizes) {
        // Print out the un/compressed sizes of each column
        long[] compressedColumnSizes = null;
        long[] uncompressedColumnSizes = null;
        // un/compressed sizes of file and no. of rows
        long rowNo = 0;
        long uncompressedFileSize = 0;
        long compressedFileSize = 0;

        // Skip from block to block since we only need the header
        while (recordReader.nextBlock()) {
            // Get the sizes from the key buffer and aggregate
            KeyBuffer keyBuffer = recordReader.getKeyBuffer();
            if (uncompressedColumnSizes == null) {
                uncompressedColumnSizes = new long[keyBuffer.getColumnNumber()];
            }
            if (compressedColumnSizes == null) {
                compressedColumnSizes = new long[keyBuffer.getColumnNumber()];
            }
            for (int i = 0; i < keyBuffer.getColumnNumber(); i++) {
                uncompressedColumnSizes[i] += keyBuffer.getEachColumnUncompressedValueLen()[i];
                compressedColumnSizes[i] += keyBuffer.getEachColumnValueLen()[i];
            }
            rowNo += keyBuffer.getNumberRows();
        }

        if (columnSizes && uncompressedColumnSizes != null && compressedColumnSizes != null) {
            // otherwise print it out as if it were a row
            for (int i = 0; i < uncompressedColumnSizes.length; i++) {
                if (pretty) {
                    System.out.println("Column " + i + ": Uncompressed size: " + uncompressedColumnSizes[i]
                            + " Compressed size: " + compressedColumnSizes[i]);
                } else {
                    System.out.print(i + TAB + uncompressedColumnSizes[i] + TAB + compressedColumnSizes[i] + NEWLINE);
                }
            }
        }

        if (fileSizes) {
            if (uncompressedColumnSizes != null && compressedColumnSizes != null) {
                for (int i = 0; i < uncompressedColumnSizes.length; i++) {
                    uncompressedFileSize += uncompressedColumnSizes[i];
                    compressedFileSize += compressedColumnSizes[i];
                }
            }
            System.out.print("File size (uncompressed): " + uncompressedFileSize
                    + ". File size (compressed): " + compressedFileSize
                    + ". Number of rows: " + rowNo + "." + NEWLINE);
        }
        System.out.flush();
        return 0;
    }

    LongWritable key = new LongWritable();
    BytesRefArrayWritable value = new BytesRefArrayWritable();
    // extra capacity in case we overrun, to avoid resizing
    StringBuilder buf = new StringBuilder(STRING_BUFFER_SIZE);
    while (recordReader.next(key, value)) {
        printRecord(value, buf);
        recordCount++;
        if (verbose && (recordCount % RECORD_PRINT_INTERVAL) == 0) {
            long now = System.currentTimeMillis();
            System.err.println("Read " + recordCount / 1024 + "k records");
            System.err.println("Read " + ((recordReader.getPos() / (1024L * 1024L))) + "MB");
            System.err.printf("Input scan rate %.2f MB/s\n", (recordReader.getPos() * 1.0 / (now - startT)) / 1024.0);
        }
        if (buf.length() > STRING_BUFFER_FLUSH_SIZE) {
            System.out.print(buf.toString());
            buf.setLength(0);
        }
    }
    // print out last part of buffer
    System.out.print(buf.toString());
    System.out.flush();
    return 0;
}
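The @Override on run(String[]) indicates that RCFileCat is written as a Hadoop Tool, so the option parsing above can also be driven programmatically through ToolRunner. A minimal sketch follows, assuming RCFileCat lives in org.apache.hadoop.hive.ql.io, has a no-argument constructor, and that /tmp/sample.rc is a hypothetical RCFile; the flags mirror those parsed in run().

// Sketch only: invoke RCFileCat.run through ToolRunner with hypothetical arguments.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.RCFileCat;
import org.apache.hadoop.util.ToolRunner;

public class RCFileCatDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // print per-column uncompressed/compressed sizes for the first 1 MB of the file
        String[] catArgs = { "--column-sizes-pretty", "--length=1048576", "/tmp/sample.rc" };
        int exitCode = ToolRunner.run(conf, new RCFileCat(), catArgs);
        System.exit(exitCode);
    }
}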