
Example 51 with Writable

use of org.apache.hadoop.io.Writable in project hive by apache.

the class HiveHFileOutputFormat method getHiveRecordWriter.

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException {
    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }
    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);
    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(tac);
    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }
    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = taskAttemptOutputdir;
                for (; ; ) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                    if (files[0].isFile()) {
                        throw new IOException("No family directories found in " + taskAttemptOutputdir + ". " + "The last component in hfile path should match column family name " + columnFamilyName);
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(taskAttemptOutputdir, true);
                fs.createNewFile(taskAttemptOutputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            // Fields are delimited by Hive's default separator, Ctrl-A (\u0001).
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}
Also used : InterruptedIOException(java.io.InterruptedIOException) KeyValue(org.apache.hadoop.hbase.KeyValue) FileStatus(org.apache.hadoop.fs.FileStatus) Writable(org.apache.hadoop.io.Writable) ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) FileSystem(org.apache.hadoop.fs.FileSystem) List(java.util.List) CellComparator(org.apache.hadoop.hbase.CellComparator) Job(org.apache.hadoop.mapreduce.Job) Cell(org.apache.hadoop.hbase.Cell) Path(org.apache.hadoop.fs.Path) ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) FileOutputCommitter(org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter) Text(org.apache.hadoop.io.Text) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Map(java.util.Map) TreeMap(java.util.TreeMap) SortedMap(java.util.SortedMap)
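
The ordering detail worth calling out above: an HFile writer expects the cells of a row in byte order of column qualifier, which is why the column list is loaded into a TreeMap keyed with Bytes.BYTES_COMPARATOR and index 0 (the row key column) is skipped. A minimal standalone sketch of that ordering, assuming only hbase-common on the classpath and a hypothetical column list:

import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.hbase.util.Bytes;

public class ColumnOrderSketch {
    public static void main(String[] args) {
        // Hypothetical value of the "columns" table property; the first entry is the row key.
        String columnList = "key,zip,city,amount";
        String[] columnArray = columnList.split(",");
        SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
        for (int i = 1; i < columnArray.length; i++) {
            // i == 0 is the row key, so it never becomes a cell of its own.
            columnMap.put(Bytes.toBytes(columnArray[i]), i);
        }
        // Iteration order is byte-lexicographic (amount, city, zip): the order in
        // which KeyValues must reach the HFile writer for a given row.
        for (byte[] qualifier : columnMap.keySet()) {
            System.out.println(Bytes.toString(qualifier));
        }
    }
}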

Example 52 with Writable

use of org.apache.hadoop.io.Writable in project hive by apache.

the class FetchOperator method getRecordReader.

private RecordReader<WritableComparable, Writable> getRecordReader() throws Exception {
    if (!iterSplits.hasNext()) {
        FetchInputFormatSplit[] splits = getNextSplits();
        if (splits == null) {
            return null;
        }
        if (!isPartitioned || convertedOI == null) {
            currSerDe = tableSerDe;
            ObjectConverter = null;
        } else {
            currSerDe = needConversion(currDesc) ? currDesc.getDeserializer(job) : tableSerDe;
            ObjectInspector inputOI = currSerDe.getObjectInspector();
            ObjectConverter = ObjectInspectorConverters.getConverter(inputOI, convertedOI);
        }
        if (isPartitioned) {
            row[1] = createPartValue(currDesc, partKeyOI);
        }
        iterSplits = Arrays.asList(splits).iterator();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Creating fetchTask with deserializer typeinfo: " + currSerDe.getObjectInspector().getTypeName());
            LOG.debug("deserializer properties:\ntable properties: " + currDesc.getTableDesc().getProperties() + "\npartition properties: " + currDesc.getProperties());
        }
    }
    final FetchInputFormatSplit target = iterSplits.next();
    @SuppressWarnings("unchecked") final RecordReader<WritableComparable, Writable> reader = target.getRecordReader(job);
    if (hasVC || work.getSplitSample() != null) {
        currRecReader = new HiveRecordReader<WritableComparable, Writable>(reader, job) {

            @Override
            public boolean doNext(WritableComparable key, Writable value) throws IOException {
                // If the current position is past the shrinkedLength calculated for
                // each split by table sampling, stop fetching any more (early exit).
                if (target.shrinkedLength > 0 && context.getIoCxt().getCurrentBlockStart() > target.shrinkedLength) {
                    return false;
                }
                return super.doNext(key, value);
            }
        };
        ((HiveContextAwareRecordReader) currRecReader).initIOContext(target, job, target.inputFormat.getClass(), reader);
    } else {
        currRecReader = reader;
    }
    key = currRecReader.createKey();
    value = currRecReader.createValue();
    headerCount = footerCount = 0;
    return currRecReader;
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) HiveContextAwareRecordReader(org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader) WritableComparable(org.apache.hadoop.io.WritableComparable) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException)
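
A minimal sketch of how a caller typically drains the old-API reader returned above; this is not Hive's actual fetch loop, just the standard createKey()/createValue()/next() pattern that the overridden doNext() plugs into:

import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.RecordReader;

public class FetchLoopSketch {
    static long drain(RecordReader<WritableComparable, Writable> reader) throws IOException {
        if (reader == null) {
            // getRecordReader() returns null once all splits are exhausted.
            return 0L;
        }
        WritableComparable key = reader.createKey();
        Writable value = reader.createValue();
        long rows = 0;
        try {
            // next() delegates to doNext(), which adds the sampling early-exit check above.
            while (reader.next(key, value)) {
                rows++; // a real caller would deserialize value with the current SerDe here
            }
        } finally {
            reader.close();
        }
        return rows;
    }
}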

Example 53 with Writable

use of org.apache.hadoop.io.Writable in project hive by apache.

the class MapJoinTableContainerSerDe method loadFastContainer.

/**
   * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path.
   * @param mapJoinDesc The descriptor for the map join
   * @param fs FileSystem of the folder.
   * @param folder The folder to load table container.
   * @param hconf The hive configuration
   * @return Loaded table.
   */
@SuppressWarnings("unchecked")
public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, FileSystem fs, Path folder, Configuration hconf) throws HiveException {
    try {
        VectorMapJoinFastTableContainer tableContainer = new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
        tableContainer.setSerde(keyContext, valueContext);
        if (fs.exists(folder)) {
            if (!fs.isDirectory(folder)) {
                throw new HiveException("Error, not a directory: " + folder);
            }
            FileStatus[] fileStatuses = fs.listStatus(folder);
            if (fileStatuses != null && fileStatuses.length > 0) {
                AbstractSerDe keySerDe = keyContext.getSerDe();
                AbstractSerDe valueSerDe = valueContext.getSerDe();
                Writable key = keySerDe.getSerializedClass().newInstance();
                Writable value = valueSerDe.getSerializedClass().newInstance();
                for (FileStatus fileStatus : fileStatuses) {
                    Path filePath = fileStatus.getPath();
                    if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
                        throw new HiveException("Error, not a file: " + filePath);
                    }
                    InputStream is = null;
                    ObjectInputStream in = null;
                    try {
                        is = fs.open(filePath, 4096);
                        in = new ObjectInputStream(is);
                        // skip the name and metadata
                        in.readUTF();
                        in.readObject();
                        int numKeys = in.readInt();
                        for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
                            key.readFields(in);
                            long numRows = in.readLong();
                            for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
                                value.readFields(in);
                                tableContainer.putRow(key, value);
                            }
                        }
                    } finally {
                        if (in != null) {
                            in.close();
                        } else if (is != null) {
                            is.close();
                        }
                    }
                }
            }
        }
        tableContainer.seal();
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileStatus(org.apache.hadoop.fs.FileStatus) ObjectInputStream(java.io.ObjectInputStream) InputStream(java.io.InputStream) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) VectorMapJoinFastTableContainer(org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer) ObjectInputStream(java.io.ObjectInputStream)
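
The read loop above implies a simple record layout for each small-table file: the container name, a metadata map, a key count, and for every key a row count followed by that many serialized values. A hedged write-side sketch inferred purely from that read order; it is not Hive's actual writer:

import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.Writable;

public class SmallTableLayoutSketch {
    static void write(ObjectOutputStream out, String name, Map<String, String> metaData,
            Map<Writable, List<Writable>> rowsByKey) throws IOException {
        out.writeUTF(name);                     // matches in.readUTF()
        out.writeObject(metaData);              // matches in.readObject(); must be a Serializable map (e.g. HashMap)
        out.writeInt(rowsByKey.size());         // matches in.readInt()
        for (Map.Entry<Writable, List<Writable>> e : rowsByKey.entrySet()) {
            e.getKey().write(out);              // matches key.readFields(in)
            out.writeLong(e.getValue().size()); // matches in.readLong()
            for (Writable value : e.getValue()) {
                value.write(out);               // matches value.readFields(in)
            }
        }
        out.flush();
    }
}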

Example 54 with Writable

use of org.apache.hadoop.io.Writable in project hive by apache.

the class MapJoinTableContainerSerDe method load.

/**
 * Loads the table container. Only used on MR path.
 * @param in Input stream.
 * @return Loaded table.
 */
@SuppressWarnings({ "unchecked" })
public MapJoinPersistableTableContainer load(ObjectInputStream in) throws HiveException {
    AbstractSerDe keySerDe = keyContext.getSerDe();
    AbstractSerDe valueSerDe = valueContext.getSerDe();
    MapJoinPersistableTableContainer tableContainer;
    try {
        String name = in.readUTF();
        Map<String, String> metaData = (Map<String, String>) in.readObject();
        tableContainer = create(name, metaData);
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (ClassNotFoundException e) {
        throw new HiveException("Class Initialization error while trying to create table container", e);
    }
    try {
        Writable keyContainer = keySerDe.getSerializedClass().newInstance();
        Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
        int numKeys = in.readInt();
        for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
            MapJoinKeyObject key = new MapJoinKeyObject();
            key.read(keyContext, in, keyContainer);
            MapJoinEagerRowContainer values = new MapJoinEagerRowContainer();
            values.read(valueContext, in, valueContainer);
            tableContainer.put(key, values);
        }
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Map(java.util.Map)
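
A hedged usage sketch for the MR-path load above; the caller and the local file path are hypothetical, and only the load(ObjectInputStream) call itself comes from the class shown:

import java.io.FileInputStream;
import java.io.ObjectInputStream;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinPersistableTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class LoadSmallTableSketch {
    static MapJoinPersistableTableContainer loadFrom(MapJoinTableContainerSerDe serDe,
            String localPath) throws Exception {
        // The stream must contain the name/metadata header followed by key/value pairs,
        // exactly as read by load() above.
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(localPath))) {
            return serDe.load(in);
        }
    }
}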

Example 55 with Writable

use of org.apache.hadoop.io.Writable in project hive by apache.

the class MapJoinTableContainerSerDe method load.

/**
   * Loads the table container from a folder. Only used on Spark path.
   * @param fs FileSystem of the folder.
   * @param folder The folder to load table container.
   * @param hconf The hive configuration
   * @return Loaded table.
   */
@SuppressWarnings("unchecked")
public MapJoinTableContainer load(FileSystem fs, Path folder, Configuration hconf) throws HiveException {
    try {
        if (!fs.exists(folder)) {
            return getDefaultEmptyContainer(keyContext, valueContext);
        }
        if (!fs.isDirectory(folder)) {
            throw new HiveException("Error, not a directory: " + folder);
        }
        FileStatus[] fileStatuses = fs.listStatus(folder);
        if (fileStatuses == null || fileStatuses.length == 0) {
            return getDefaultEmptyContainer(keyContext, valueContext);
        }
        AbstractSerDe keySerDe = keyContext.getSerDe();
        AbstractSerDe valueSerDe = valueContext.getSerDe();
        Writable keyContainer = keySerDe.getSerializedClass().newInstance();
        Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
        MapJoinTableContainer tableContainer = null;
        boolean useOptimizedContainer = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
        for (FileStatus fileStatus : fileStatuses) {
            Path filePath = fileStatus.getPath();
            if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
                throw new HiveException("Error, not a file: " + filePath);
            }
            InputStream is = null;
            ObjectInputStream in = null;
            try {
                is = fs.open(filePath, 4096);
                in = new ObjectInputStream(is);
                String name = in.readUTF();
                Map<String, String> metaData = (Map<String, String>) in.readObject();
                if (tableContainer == null) {
                    tableContainer = useOptimizedContainer ? new MapJoinBytesTableContainer(hconf, valueContext, -1, 0) : create(name, metaData);
                }
                tableContainer.setSerde(keyContext, valueContext);
                if (useOptimizedContainer) {
                    loadOptimized((MapJoinBytesTableContainer) tableContainer, in, keyContainer, valueContainer);
                } else {
                    loadNormal((MapJoinPersistableTableContainer) tableContainer, in, keyContainer, valueContainer);
                }
            } finally {
                if (in != null) {
                    in.close();
                } else if (is != null) {
                    is.close();
                }
            }
        }
        if (tableContainer != null) {
            tableContainer.seal();
        }
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileStatus(org.apache.hadoop.fs.FileStatus) ObjectInputStream(java.io.ObjectInputStream) InputStream(java.io.InputStream) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Map(java.util.Map) ObjectInputStream(java.io.ObjectInputStream)
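
A hedged usage sketch for the Spark-path load above; the caller is hypothetical, but the configuration flag is the one the method itself checks to choose between the optimized bytes container and the persistable container:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;

public class LoadFolderSketch {
    static MapJoinTableContainer loadFolder(MapJoinTableContainerSerDe serDe,
            String folderUri, Configuration hconf) throws Exception {
        // Request the optimized MapJoinBytesTableContainer; set to false to fall back
        // to the persistable container created from the file's name/metadata header.
        HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, true);
        Path folder = new Path(folderUri);
        FileSystem fs = folder.getFileSystem(hconf);
        return serDe.load(fs, folder, hconf); // returns a sealed container, or an empty default if the folder is missing
    }
}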

Aggregations

Writable (org.apache.hadoop.io.Writable): 221
IntWritable (org.apache.hadoop.io.IntWritable): 103
LongWritable (org.apache.hadoop.io.LongWritable): 91
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 75
BytesWritable (org.apache.hadoop.io.BytesWritable): 74
FloatWritable (org.apache.hadoop.io.FloatWritable): 73
Test (org.junit.Test): 68
IOException (java.io.IOException): 43
Path (org.apache.hadoop.fs.Path): 43
Text (org.apache.hadoop.io.Text): 40
ArrayWritable (org.apache.hadoop.io.ArrayWritable): 37
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 34
SequenceFile (org.apache.hadoop.io.SequenceFile): 32
Configuration (org.apache.hadoop.conf.Configuration): 31
DoubleWritable (org.apache.hadoop.io.DoubleWritable): 30
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 29
ByteWritable (org.apache.hadoop.io.ByteWritable): 28
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 25
FileSystem (org.apache.hadoop.fs.FileSystem): 24
ArrayList (java.util.ArrayList): 23