
Example 1 with Reader

use of org.apache.hadoop.hive.ql.io.orc.Reader in project h2o-3 by h2oai.

the class OrcParserProvider method readSetup.

/**
   * Creates the reader and other info needed to parse an ORC file.
   * It does not override the columnNames or columnTypes that the user may want to
   * force upon it; however, only requests to set a column type to enum (categorical)
   * are honored at this point, and all other type requests are ignored.
   *
   * @param f the ORC file to derive the parse setup from
   * @param columnNames user-specified column names, or null to keep the names from the file
   * @param columnTypes user-specified column types, or null to keep the types from the file
   * @return the ParseSetup derived from the file, adjusted for user-specified names and enum types
   */
public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) {
    try {
        Reader orcFileReader = getReader(f);
        StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
        OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp);
        // restore the user-specified columnNames and columnTypes if they were provided
        if (columnNames != null && (stp.getAllColNames().length == columnNames.length)) {
            // copy column names
            stp.setColumnNames(columnNames);
            stp.setAllColNames(columnNames);
        }
        if (columnTypes != null && (columnTypes.length == stp.getColumnTypes().length)) {
            // copy enum type only
            byte[] old_columnTypes = stp.getColumnTypes();
            String[] old_columnTypeNames = stp.getColumnTypesString();
            for (int index = 0; index < columnTypes.length; index++) {
                // only copy the enum (categorical) types
                if (columnTypes[index] == Vec.T_CAT)
                    old_columnTypes[index] = columnTypes[index];
            }
            stp.setColumnTypes(old_columnTypes);
            stp.setColumnTypeStrings(old_columnTypeNames);
        }
        List<StripeInformation> stripesInfo = orcFileReader.getStripes();
        if (stripesInfo.size() == 0) {
            // empty file
            f.setChunkSize(stp._chunk_size = (int) f.length());
            return stp;
        }
        f.setNChunks(stripesInfo.size());
        stp._chunk_size = f._chunkSize;
        // The ORC parser needs a one-to-one mapping between chunks and stripes (just ids; offsets do not matter)
        assert f.nChunks() == stripesInfo.size();
        return stp;
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Also used : Reader(org.apache.hadoop.hive.ql.io.orc.Reader) IOException(java.io.IOException) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
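
For reference, a minimal standalone sketch of the same setup pattern. This is not the h2o-3 getReader(f) implementation; it simply obtains a Reader with the OrcFile.createReader API used in the other examples and reads the schema, stripe list, and row count that readSetup relies on. The /tmp path is a hypothetical placeholder.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public class OrcSetupSketch {
    public static void main(String[] args) throws IOException {
        // hypothetical placeholder path; point this at a real ORC file
        Path path = new Path("/tmp/example.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        // the StructObjectInspector exposes the column schema that deriveParseSetup inspects
        StructObjectInspector insp = (StructObjectInspector) reader.getObjectInspector();
        System.out.println("columns: " + insp.getAllStructFieldRefs().size());
        // the stripe list drives the one-chunk-per-stripe mapping set up in readSetup
        System.out.println("stripes: " + reader.getStripes().size());
        System.out.println("rows:    " + reader.getNumberOfRows());
    }
}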

Example 2 with Reader

use of org.apache.hadoop.hive.ql.io.orc.Reader in project h2o-3 by h2oai.

the class OrcTestUtils method compareOrcAndH2OFrame.

static int compareOrcAndH2OFrame(String fileName, File f, Set<String> failedFiles) throws IOException {
    Frame h2oFrame = null;
    try {
        Configuration conf = new Configuration();
        Path p = new Path(f.toString());
        Reader orcFileReader = OrcFile.createReader(p, OrcFile.readerOptions(conf));
        h2oFrame = water.TestUtil.parse_test_file(f.toString());
        return compareH2OFrame(fileName, failedFiles, h2oFrame, orcFileReader);
    } finally {
        if (h2oFrame != null)
            h2oFrame.delete();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Frame(water.fvec.Frame) Configuration(org.apache.hadoop.conf.Configuration) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) Reader(org.apache.hadoop.hive.ql.io.orc.Reader)
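
A minimal sketch, assuming h2o-3's water.fvec.Frame API, of a basic shape check one could run alongside compareH2OFrame (this is not the actual comparison code): the parsed Frame should have one row per ORC row and one column per top-level struct field.

// assumes the h2oFrame and orcFileReader produced in compareOrcAndH2OFrame above
static void checkShape(Frame h2oFrame, Reader orcFileReader) {
    StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
    // row count reported by the ORC reader vs. rows parsed into the Frame
    assert h2oFrame.numRows() == orcFileReader.getNumberOfRows();
    // one Frame column per top-level struct field
    assert h2oFrame.numCols() == insp.getAllStructFieldRefs().size();
}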

Example 3 with Reader

use of org.apache.hadoop.hive.ql.io.orc.Reader in project hive by apache.

the class TestStreaming method dumpBucket.

private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
    org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
    Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
    System.out.format("Found Bucket File : %s \n", orcFile.getName());
    ArrayList<SampleRec> result = new ArrayList<SampleRec>();
    while (rows.hasNext()) {
        Object row = rows.next(null);
        SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
        result.add(rec);
    }
    return result;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) ArrayList(java.util.ArrayList) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
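
A minimal generic sketch of the same Reader / RecordReader / StructObjectInspector iteration used by dumpBucket, printing raw field values instead of deserializing SampleRec rows. The file path is a hypothetical placeholder.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public class OrcDumpSketch {
    public static void main(String[] args) throws IOException {
        // hypothetical placeholder path
        Path orcFile = new Path("/tmp/bucket_00000");
        Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(new Configuration()));
        StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
        RecordReader rows = reader.rows();
        Object row = null;
        while (rows.hasNext()) {
            // reuse the previous row object to reduce allocation
            row = rows.next(row);
            for (StructField field : inspector.getAllStructFieldRefs()) {
                System.out.println(field.getFieldName() + " = " + inspector.getStructFieldData(row, field));
            }
        }
        rows.close();
    }
}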

Example 4 with Reader

use of org.apache.hadoop.hive.ql.io.orc.Reader in project presto by prestodb.

the class OrcFileRewriter method rewrite.

public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
        FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
        Reader reader = createReader(fileSystem, path(input));
        if (reader.getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getNumberOfRows());
        WriterOptions writerOptions = new OrcWriterOptions(CONFIGURATION)
                .memory(new NullMemoryManager(CONFIGURATION))
                .fileSystem(fileSystem)
                .compress(reader.getCompression())
                .inspector(reader.getObjectInspector());
        long start = System.nanoTime();
        try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
            Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
            if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
            }
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
}
Also used : RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) OrcFile.createReader(org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) NullMemoryManager(org.apache.hadoop.hive.ql.io.orc.NullMemoryManager) ByteBuffer(java.nio.ByteBuffer) OrcWriterOptions(org.apache.hadoop.hive.ql.io.orc.OrcWriterOptions) SyncingFileSystem(com.facebook.presto.raptor.util.SyncingFileSystem) FileSystem(org.apache.hadoop.fs.FileSystem) WriterOptions(org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) OrcFile.createWriter(org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter)
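
A minimal sketch of the same rewrite idea using only the plain Hive OrcFile API, without Presto's SyncingFileSystem, Closer, and NullMemoryManager helpers: copy every row whose index is not set in the deletion BitSet. The paths are hypothetical placeholders, and the sketch omits the user-metadata copy and the bounds checks shown above.

import java.io.IOException;
import java.util.BitSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.ql.io.orc.Writer;

public class OrcRewriteSketch {
    static void rewrite(Path input, Path output, BitSet rowsToDelete) throws IOException {
        Configuration conf = new Configuration();
        Reader reader = OrcFile.createReader(input, OrcFile.readerOptions(conf));
        // carry the input file's schema and compression over to the output file
        Writer writer = OrcFile.createWriter(output, OrcFile.writerOptions(conf)
                .inspector(reader.getObjectInspector())
                .compress(reader.getCompression()));
        RecordReader rows = reader.rows();
        Object row = null;
        int index = 0;
        while (rows.hasNext()) {
            row = rows.next(row);
            // keep only the rows that are not flagged for deletion
            if (!rowsToDelete.get(index)) {
                writer.addRow(row);
            }
            index++;
        }
        rows.close();
        writer.close();
    }
}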

Example 5 with Reader

use of org.apache.hadoop.hive.ql.io.orc.Reader in project DataX by alibaba.

the class DFSUtil method getAllColumnsCount.

private int getAllColumnsCount(String filePath) {
    int columnsCount;
    final String colFinal = "_col";
    Path path = new Path(filePath);
    try {
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(hadoopConf));
        String type_struct = reader.getObjectInspector().getTypeName();
        columnsCount = (type_struct.length() - type_struct.replace(colFinal, "").length()) / colFinal.length();
        return columnsCount;
    } catch (IOException e) {
        String message = "Failed to read the orcfile column count, please contact the system administrator";
        throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) RCFileRecordReader(org.apache.hadoop.hive.ql.io.RCFileRecordReader) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) IOException(java.io.IOException)
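
An alternative sketch (not the DataX implementation): count the columns via the StructObjectInspector's field list instead of counting "_col" occurrences in the type string, which only works when the ORC schema uses the default _col0, _col1, ... field names. hadoopConf, DataXException, and HdfsReaderErrorCode are the same members assumed by the method above.

// not the DataX implementation; counts top-level struct fields directly
private int getAllColumnsCount(String filePath) {
    try {
        Reader reader = OrcFile.createReader(new Path(filePath), OrcFile.readerOptions(hadoopConf));
        StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
        return inspector.getAllStructFieldRefs().size();
    } catch (IOException e) {
        throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR,
                "Failed to read the orcfile column count, please contact the system administrator");
    }
}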

Aggregations

Reader (org.apache.hadoop.hive.ql.io.orc.Reader): 5 usages
IOException (java.io.IOException): 3 usages
RecordReader (org.apache.hadoop.hive.ql.io.orc.RecordReader): 3 usages
Configuration (org.apache.hadoop.conf.Configuration): 2 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2 usages
SyncingFileSystem (com.facebook.presto.raptor.util.SyncingFileSystem): 1 usage
ThreadContextClassLoader (com.facebook.presto.spi.classloader.ThreadContextClassLoader): 1 usage
InterruptedIOException (java.io.InterruptedIOException): 1 usage
ByteBuffer (java.nio.ByteBuffer): 1 usage
ArrayList (java.util.ArrayList): 1 usage
RCFileRecordReader (org.apache.hadoop.hive.ql.io.RCFileRecordReader): 1 usage
NullMemoryManager (org.apache.hadoop.hive.ql.io.orc.NullMemoryManager): 1 usage
WriterOptions (org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions): 1 usage
OrcFile.createReader (org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader): 1 usage
OrcFile.createWriter (org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter): 1 usage
OrcWriterOptions (org.apache.hadoop.hive.ql.io.orc.OrcWriterOptions): 1 usage
StripeInformation (org.apache.hadoop.hive.ql.io.orc.StripeInformation): 1 usage
Writer (org.apache.hadoop.hive.ql.io.orc.Writer): 1 usage