use of org.apache.hadoop.hive.ql.io.orc.Reader in project h2o-3 by h2oai.
the class OrcParserProvider method readSetup.
/**
 * This method creates the reader and other info needed to parse an ORC file.
 * It preserves any columnNames and columnTypes the user has specified rather
 * than overriding them. However, for column types we only honor requests to
 * set a column to enum at this point and ignore all other type requests.
 *
 * @param f the ORC file to derive a parse setup for
 * @param columnNames user-specified column names, or null
 * @param columnTypes user-specified column types, or null
 * @return the derived parse setup for the file
 */
public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) {
    try {
        Reader orcFileReader = getReader(f);
        StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
        OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp);
        // restore the columnNames and columnTypes if they were specified already
        if (columnNames != null && stp.getAllColNames().length == columnNames.length) {
            // copy column names
            stp.setColumnNames(columnNames);
            stp.setAllColNames(columnNames);
        }
        if (columnTypes != null && columnTypes.length == stp.getColumnTypes().length) {
            // copy enum types only
            byte[] old_columnTypes = stp.getColumnTypes();
            String[] old_columnTypeNames = stp.getColumnTypesString();
            for (int index = 0; index < columnTypes.length; index++) {
                if (columnTypes[index] == Vec.T_CAT)  // only copy the enum types
                    old_columnTypes[index] = columnTypes[index];
            }
            stp.setColumnTypes(old_columnTypes);
            stp.setColumnTypeStrings(old_columnTypeNames);
        }
        List<StripeInformation> stripesInfo = orcFileReader.getStripes();
        if (stripesInfo.isEmpty()) {
            // empty file
            f.setChunkSize(stp._chunk_size = (int) f.length());
            return stp;
        }
        f.setNChunks(stripesInfo.size());
        stp._chunk_size = f._chunkSize;
        // The ORC parser needs a one-to-one mapping between chunks and stripes
        // (just the ids; offsets do not matter).
        assert f.nChunks() == stripesInfo.size();
        return stp;
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
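The one-to-one chunk/stripe mapping above is driven by Reader.getStripes(). A minimal standalone sketch of that API (the class name and the file-path argument are placeholders, not h2o-3 code) might look like this:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.StripeInformation;

public class StripeDump {
    public static void main(String[] args) throws Exception {
        // args[0]: path to an ORC file (placeholder)
        Reader reader = OrcFile.createReader(new Path(args[0]),
                OrcFile.readerOptions(new Configuration()));
        int id = 0;
        for (StripeInformation stripe : reader.getStripes()) {
            // each stripe would map to one parser chunk; offsets are informational
            System.out.printf("stripe %d: offset=%d rows=%d length=%d%n",
                    id++, stripe.getOffset(), stripe.getNumberOfRows(), stripe.getLength());
        }
    }
}
```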
use of org.apache.hadoop.hive.ql.io.orc.Reader in project h2o-3 by h2oai.
the class OrcTestUtils method compareOrcAndH2OFrame.
static int compareOrcAndH2OFrame(String fileName, File f, Set<String> failedFiles) throws IOException {
    Frame h2oFrame = null;
    try {
        Configuration conf = new Configuration();
        Path p = new Path(f.toString());
        Reader orcFileReader = OrcFile.createReader(p, OrcFile.readerOptions(conf));
        h2oFrame = water.TestUtil.parse_test_file(f.toString());
        return compareH2OFrame(fileName, failedFiles, h2oFrame, orcFileReader);
    } finally {
        // always release the H2O frame, even when the comparison throws
        if (h2oFrame != null)
            h2oFrame.delete();
    }
}
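A hypothetical driver for this helper could walk a directory of test ORC files and accumulate the mismatches; the class name, the directory argument, and the same-package access to OrcTestUtils are assumptions here, not part of h2o-3:

```java
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class OrcComparisonDriver {
    public static void main(String[] args) throws IOException {
        Set<String> failedFiles = new HashSet<>();
        int failures = 0;
        File[] orcFiles = new File(args[0]).listFiles();  // directory of test ORC files (placeholder)
        if (orcFiles == null) return;
        for (File f : orcFiles) {
            failures += OrcTestUtils.compareOrcAndH2OFrame(f.getName(), f, failedFiles);
        }
        System.out.println(failures + " mismatching files: " + failedFiles);
    }
}
```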
use of org.apache.hadoop.hive.ql.io.orc.Reader in project hive by apache.
the class TestStreaming method dumpBucket.
private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
    org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
    Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
    System.out.format("Found Bucket File : %s \n", orcFile.getName());
    ArrayList<SampleRec> result = new ArrayList<SampleRec>();
    try {
        while (rows.hasNext()) {
            Object row = rows.next(null);
            // Field 5 of an ACID delta-file row is the actual row payload, after
            // operation, originalTransaction, bucket, rowId, and currentTransaction.
            SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
            result.add(rec);
        }
    } finally {
        rows.close();
    }
    return result;
}
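Note that rows.next(null) allocates a fresh row object on every call; the Hive RecordReader also accepts the previously returned row so its buffers can be reused. A minimal sketch of that reuse-and-close idiom (the file path is a placeholder):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

public class OrcRowScan {
    public static void main(String[] args) throws Exception {
        Reader reader = OrcFile.createReader(new Path(args[0]),
                OrcFile.readerOptions(new Configuration()));
        RecordReader rows = reader.rows();
        try {
            Object row = null;
            while (rows.hasNext()) {
                // passing the previous row lets the reader reuse its buffers
                row = rows.next(row);
                System.out.println(row);
            }
        } finally {
            rows.close();  // release the record reader's resources
        }
    }
}
```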
use of org.apache.hadoop.hive.ql.io.orc.Reader in project presto by prestodb.
the class OrcFileRewriter method rewrite.
public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
            FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
        Reader reader = createReader(fileSystem, path(input));
        if (reader.getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getNumberOfRows() == deleteRowCount) {
            // every row is deleted; there is nothing to rewrite
            return new OrcFileInfo(0, 0);
        }
        if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getNumberOfRows());
        WriterOptions writerOptions = new OrcWriterOptions(CONFIGURATION)
                .memory(new NullMemoryManager(CONFIGURATION))
                .fileSystem(fileSystem)
                .compress(reader.getCompression())
                .inspector(reader.getObjectInspector());
        long start = System.nanoTime();
        try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
                Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
            if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                // carry the file metadata over to the rewritten file
                ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
            }
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)",
                    input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
}
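The per-row copy is delegated to a rewrite(RecordReader, Writer, BitSet, int) overload not shown in this snippet. A plausible sketch of such a loop, assuming only the Hive RecordReader/Writer APIs already used above (the method name is hypothetical, and the uncompressed-size accounting is omitted):

```java
// Hypothetical copy loop: skip every row whose index is set in rowsToDelete.
private static OrcFileInfo copyRows(RecordReader reader, Writer writer,
        BitSet rowsToDelete, int inputRowCount) throws IOException {
    Object row = null;
    long outputRows = 0;
    for (int index = 0; index < inputRowCount; index++) {
        row = reader.next(row);  // reuse the row object between iterations
        if (!rowsToDelete.get(index)) {
            writer.addRow(row);
            outputRows++;
        }
    }
    // second argument (uncompressed size) is not tracked in this sketch
    return new OrcFileInfo(outputRows, 0);
}
```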
use of org.apache.hadoop.hive.ql.io.orc.Reader in project DataX by alibaba.
the class DFSUtil method getAllColumnsCount.
private int getAllColumnsCount(String filePath) {
    int columnsCount;
    final String colFinal = "_col";
    Path path = new Path(filePath);
    try {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(hadoopConf));
        String type_struct = reader.getObjectInspector().getTypeName();
        // count occurrences of "_col" in the struct type name, e.g.
        // "struct<_col0:int,_col1:string>" yields two columns
        columnsCount = (type_struct.length() - type_struct.replace(colFinal, "").length()) / colFinal.length();
        return columnsCount;
    } catch (IOException e) {
        String message = "Failed to read the column count of the ORC file; please contact the system administrator.";
        throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
    }
}
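Counting "_col" substrings only works when the file uses Hive's default generated column names. A sketch of an alternative that asks the object inspector for its field list directly (the method name is hypothetical, not part of DataX; it assumes the same hadoopConf field):

```java
// requires org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
private int getColumnCountViaInspector(String filePath) throws IOException {
    Reader reader = OrcFile.createReader(new Path(filePath),
            OrcFile.readerOptions(hadoopConf));
    // the top-level type of an ORC file is always a struct
    StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
    return inspector.getAllStructFieldRefs().size();
}
```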