Example 26 with AbstractSerDe

Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.

From class MapJoinTableContainerSerDe, method load().

/**
 * Loads the table container from a folder. Only used on Spark path.
 * @param fs FileSystem of the folder.
 * @param folder The folder from which to load the table container.
 * @param hconf The Hive configuration.
 * @return Loaded table.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer load(FileSystem fs, Path folder, Configuration hconf) throws HiveException {
    try {
        if (!fs.exists(folder)) {
            return getDefaultEmptyContainer(keyContext, valueContext);
        }
        if (!fs.isDirectory(folder)) {
            throw new HiveException("Error, not a directory: " + folder);
        }
        FileStatus[] fileStatuses = fs.listStatus(folder);
        if (fileStatuses == null || fileStatuses.length == 0) {
            return getDefaultEmptyContainer(keyContext, valueContext);
        }
        AbstractSerDe keySerDe = keyContext.getSerDe();
        AbstractSerDe valueSerDe = valueContext.getSerDe();
        Writable keyContainer = keySerDe.getSerializedClass().newInstance();
        Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
        MapJoinTableContainer tableContainer = null;
        boolean useOptimizedContainer = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
        for (FileStatus fileStatus : fileStatuses) {
            Path filePath = fileStatus.getPath();
            if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
                throw new HiveException("Error, not a file: " + filePath);
            }
            InputStream is = null;
            ObjectInputStream in = null;
            try {
                is = fs.open(filePath);
                in = new ObjectInputStream(is);
                String name = in.readUTF();
                Map<String, String> metaData = (Map<String, String>) in.readObject();
                if (tableContainer == null) {
                    tableContainer = useOptimizedContainer ? new MapJoinBytesTableContainer(hconf, valueContext, -1, 0) : create(name, metaData);
                }
                tableContainer.setSerde(keyContext, valueContext);
                if (useOptimizedContainer) {
                    loadOptimized((MapJoinBytesTableContainer) tableContainer, in, keyContainer, valueContainer);
                } else {
                    loadNormal((MapJoinPersistableTableContainer) tableContainer, in, keyContainer, valueContainer);
                }
            } finally {
                if (in != null) {
                    in.close();
                } else if (is != null) {
                    is.close();
                }
            }
        }
        if (tableContainer != null) {
            tableContainer.seal();
        }
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileStatus(org.apache.hadoop.fs.FileStatus) ObjectInputStream(java.io.ObjectInputStream) InputStream(java.io.InputStream) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) Map(java.util.Map)
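
For context, a minimal sketch of how this Spark-path variant might be invoked, assuming a MapJoinTableContainerSerDe instance (containerSerDe) whose key and value contexts were configured when the small table was dumped; the folder location is hypothetical, and the calling method is assumed to declare throws HiveException and IOException.

// Hedged usage sketch: containerSerDe and the dump folder are assumptions,
// not taken from the Hive sources above.
Configuration hconf = new Configuration();
FileSystem fs = FileSystem.get(hconf);
Path dumpFolder = new Path("/tmp/hive/smalltable-dump");  // hypothetical dump location
MapJoinTableContainer smallTable = containerSerDe.load(fs, dumpFolder, hconf);
// A missing or empty folder yields a default empty container; otherwise the
// returned container is sealed and ready for probing during the map-side join.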

Example 27 with AbstractSerDe

Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.

From class MapJoinTableContainerSerDe, method load().

/**
 * Loads the table container. Only used on MR path.
 * @param in Input stream.
 * @return Loaded table.
 */
@SuppressWarnings({ "unchecked" })
public MapJoinPersistableTableContainer load(ObjectInputStream in) throws HiveException {
    AbstractSerDe keySerDe = keyContext.getSerDe();
    AbstractSerDe valueSerDe = valueContext.getSerDe();
    MapJoinPersistableTableContainer tableContainer;
    try {
        String name = in.readUTF();
        Map<String, String> metaData = (Map<String, String>) in.readObject();
        tableContainer = create(name, metaData);
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (ClassNotFoundException e) {
        throw new HiveException("Class Initialization error while trying to create table container", e);
    }
    try {
        Writable keyContainer = keySerDe.getSerializedClass().newInstance();
        Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
        int numKeys = in.readInt();
        for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
            MapJoinKeyObject key = new MapJoinKeyObject();
            key.read(keyContext, in, keyContainer);
            MapJoinEagerRowContainer values = new MapJoinEagerRowContainer();
            values.read(valueContext, in, valueContainer);
            tableContainer.put(key, values);
        }
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used: HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) Map(java.util.Map)
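
On the MR path the container comes from a single serialized stream rather than a folder. A hedged sketch of reading one persisted hash-table file follows; the file name and the containerSerDe instance are assumptions, java.io imports are assumed, and the enclosing method is assumed to declare throws HiveException.

// Hedged sketch: the file name and containerSerDe are assumptions.
try (ObjectInputStream in = new ObjectInputStream(
        new BufferedInputStream(new FileInputStream("smalltable.hashtable")))) {
    MapJoinPersistableTableContainer smallTable = containerSerDe.load(in);
    // smallTable now maps MapJoinKeyObject keys to MapJoinEagerRowContainer row groups.
} catch (IOException e) {
    throw new HiveException("Failed to read the persisted hash table", e);
}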

Example 28 with AbstractSerDe

Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.

From class MapJoinTableContainerSerDe, method loadFastContainer().

/**
 * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path.
 * @param mapJoinDesc The descriptor for the map join
 * @param fs FileSystem of the folder.
 * @param folder The folder from which to load the table container.
 * @param hconf The Hive configuration.
 * @return Loaded table.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, FileSystem fs, Path folder, Configuration hconf) throws HiveException {
    try {
        VectorMapJoinFastTableContainer tableContainer = new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
        tableContainer.setSerde(keyContext, valueContext);
        if (fs.exists(folder)) {
            if (!fs.isDirectory(folder)) {
                throw new HiveException("Error, not a directory: " + folder);
            }
            FileStatus[] fileStatuses = fs.listStatus(folder);
            if (fileStatuses != null && fileStatuses.length > 0) {
                AbstractSerDe keySerDe = keyContext.getSerDe();
                AbstractSerDe valueSerDe = valueContext.getSerDe();
                Writable key = keySerDe.getSerializedClass().newInstance();
                Writable value = valueSerDe.getSerializedClass().newInstance();
                for (FileStatus fileStatus : fileStatuses) {
                    Path filePath = fileStatus.getPath();
                    if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
                        throw new HiveException("Error, not a file: " + filePath);
                    }
                    InputStream is = null;
                    ObjectInputStream in = null;
                    try {
                        is = fs.open(filePath);
                        in = new ObjectInputStream(is);
                        // skip the name and metadata
                        in.readUTF();
                        in.readObject();
                        int numKeys = in.readInt();
                        for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
                            key.readFields(in);
                            long numRows = in.readLong();
                            for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
                                value.readFields(in);
                                tableContainer.putRow(key, value);
                            }
                        }
                    } finally {
                        if (in != null) {
                            in.close();
                        } else if (is != null) {
                            is.close();
                        }
                    }
                }
            }
        }
        tableContainer.seal();
        return tableContainer;
    } catch (IOException e) {
        throw new HiveException("IO error while trying to create table container", e);
    } catch (Exception e) {
        throw new HiveException("Error while trying to create table container", e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileStatus(org.apache.hadoop.fs.FileStatus) ObjectInputStream(java.io.ObjectInputStream) InputStream(java.io.InputStream) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) ConcurrentModificationException(java.util.ConcurrentModificationException) VectorMapJoinFastTableContainer(org.apache.hadoop.hive.ql.exec.vector.mapjoin.fast.VectorMapJoinFastTableContainer)
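
When the optimized vectorized path is in use, the same dump folder can instead be loaded into the fast container. A short sketch, assuming the same containerSerDe, fs, dumpFolder and hconf as in the Example 26 sketch plus a MapJoinDesc (mapJoinDesc) describing the join; all of these names are assumptions.

// Hedged sketch: mapJoinDesc, containerSerDe, fs, dumpFolder and hconf are assumed.
MapJoinTableContainer fastTable =
        containerSerDe.loadFastContainer(mapJoinDesc, fs, dumpFolder, hconf);
// Unlike load(), this variant always returns a sealed VectorMapJoinFastTableContainer,
// even when the folder is missing or empty.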

Example 29 with AbstractSerDe

Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.

From class TestInputOutputFormat, method testRowNumberUniquenessInDifferentSplits().

/**
 * also see {@link TestOrcFile#testPredicatePushdown()}
 * This tests that {@link RecordReader#getRowNumber()} works with multiple splits
 * @throws Exception
 */
@Test
public void testRowNumberUniquenessInDifferentSplits() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // Save the conf variable values so that they can be restored later.
    long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
    long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);
    // Set the conf variable values for this test.
    // 10000 bytes per stripe
    long newStripeSize = 10000L;
    // 100 bytes per split
    long newMaxSplitSize = 100L;
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);
    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
    // The following loop should create 20 stripes in the orc file.
    for (int i = 0; i < newStripeSize * 10; ++i) {
        writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
    }
    writer.close(true);
    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    int numExpectedSplits = 20;
    InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
    assertEquals(numExpectedSplits, splits.length);
    for (int i = 0; i < numExpectedSplits; ++i) {
        OrcSplit split = (OrcSplit) splits[i];
        Reader.Options orcReaderOptions = new Reader.Options();
        orcReaderOptions.range(split.getStart(), split.getLength());
        OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
        Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
        RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
        for (int j = 0; recordReader.hasNext(); j++) {
            long rowNum = (i * 5000) + j;
            long rowNumActual = recordReader.getRowNumber();
            assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
            Object row = recordReader.next(null);
        }
        recordReader.close();
    }
    // Reset the conf variable values that we changed for this test.
    if (oldDefaultStripeSize != -1L) {
        conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
    } else {
        // this means that nothing was set for default stripe size previously, so we should unset it.
        conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
    }
    if (oldMaxSplitSize != -1L) {
        conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
    } else {
        // this means that nothing was set for max split size previously, so we should unset it.
        conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
    }
}
Also used: Properties(java.util.Properties) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) InputSplit(org.apache.hadoop.mapred.InputSplit) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
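
The test saves the shared conf values up front and restores them only at the very end, so a failed assertion in the middle would leave the modified values in place. A hedged sketch of the same save/restore idea wrapped in try/finally (not how the Hive test is written), shown for the split-size knob only:

long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);
conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, 100L);
try {
    // ... generate splits and run the row-number assertions with the temporary value ...
} finally {
    if (oldMaxSplitSize != -1L) {
        conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
    } else {
        // nothing was set before, so remove the key entirely
        conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
    }
}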

Example 30 with AbstractSerDe

Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.

From class DynamicPartitionFileRecordWriterContainer, method getLocalFileWriter().

@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
    OutputJobInfo localJobInfo = null;
    // Calculate which writer to use from the remaining values - this needs to
    // be done before we delete cols.
    List<String> dynamicPartValues = new ArrayList<String>();
    for (Integer colToAppend : dynamicPartCols) {
        Object partitionValue = value.get(colToAppend);
        dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
    }
    String dynKey = dynamicPartValues.toString();
    if (!baseDynamicWriters.containsKey(dynKey)) {
        if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
            throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
        }
        org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
        configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
        localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
        // Setup serDe.
        AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
        try {
            InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
        } catch (SerDeException e) {
            throw new IOException("Failed to initialize SerDe", e);
        }
        // create base OutputFormat
        org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
        // We are skipping calling checkOutputSpecs() for each partition
        // As it can throw a FileAlreadyExistsException when more than one
        // mapper is writing to a partition.
        // See HCATALOG-490, also to avoid contacting the namenode for each new
        // FileOutputFormat instance.
        // In general this should be ok for most FileOutputFormat implementations
        // but may become an issue for cases when the method is used to perform
        // other setup tasks.
        // Get Output Committer
        org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
        // Create currJobContext the latest so it gets all the config changes
        org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
        // Set up job.
        baseOutputCommitter.setupJob(currJobContext);
        // Recreate to refresh jobConf of currTask context.
        currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(), currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
        // Set temp location.
        currTaskContext.getConfiguration().set("mapred.work.output.dir", new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
        // Set up task.
        baseOutputCommitter.setupTask(currTaskContext);
        Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
        RecordWriter baseRecordWriter = baseOF.getRecordWriter(parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(), childPath.toString(), InternalUtil.createReporter(currTaskContext));
        baseDynamicWriters.put(dynKey, baseRecordWriter);
        baseDynamicSerDe.put(dynKey, currSerDe);
        baseDynamicCommitters.put(dynKey, baseOutputCommitter);
        dynamicContexts.put(dynKey, currTaskContext);
        dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
        dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
    }
    return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey), baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
Also used: Path(org.apache.hadoop.fs.Path) FileOutputCommitter(org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapred.RecordWriter) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
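
The heart of this method is a lazily populated cache keyed by the stringified list of dynamic-partition values: the first record seen for a new partition triggers SerDe, committer and writer setup, and later records reuse those objects through dynKey. A minimal sketch of that keying pattern with plain JDK collections (java.util and java.io.StringWriter assumed imported); the cap value and the StringWriter stand-in are illustrative, not HCatalog API.

Map<String, StringWriter> writersByPartition = new HashMap<>();
int maxDynamicPartitions = 100;  // illustrative cap

List<String> dynamicPartValues = Arrays.asList("2024-01-01", "US");  // example partition values
String dynKey = dynamicPartValues.toString();                        // "[2024-01-01, US]"
if (!writersByPartition.containsKey(dynKey)
        && maxDynamicPartitions != -1
        && writersByPartition.size() > maxDynamicPartitions) {
    throw new IllegalStateException("Too many dynamic partitions: " + writersByPartition.size());
}
StringWriter writer = writersByPartition.computeIfAbsent(dynKey, key -> {
    // In getLocalFileWriter this is where the SerDe, OutputCommitter, task context
    // and base RecordWriter are created once and cached for the new partition key.
    return new StringWriter();
});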

Aggregations

AbstractSerDe (org.apache.hadoop.hive.serde2.AbstractSerDe): 43
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 25
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 17
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 15
ArrayList (java.util.ArrayList): 12
Properties (java.util.Properties): 12
BytesWritable (org.apache.hadoop.io.BytesWritable): 11
IOException (java.io.IOException): 8
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8
Writable (org.apache.hadoop.io.Writable): 8
MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector): 7
InputSplit (org.apache.hadoop.mapred.InputSplit): 7
Test (org.junit.Test): 7
AbstractPrimitiveLazyObjectInspector (org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.AbstractPrimitiveLazyObjectInspector): 6
LazyBinaryMapObjectInspector (org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryMapObjectInspector): 6
JavaBinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaBinaryObjectInspector): 6
WritableBinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector): 6
LinkedHashMap (java.util.LinkedHashMap): 5
Path (org.apache.hadoop.fs.Path): 5
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 5