use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
the class MapJoinTableContainerSerDe method load.
/**
 * Loads the table container from a folder. Only used on the Spark path.
 * @param fs FileSystem that holds the folder.
 * @param folder The folder from which to load the table container.
 * @param hconf The Hive configuration.
 * @return The loaded table container.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer load(FileSystem fs, Path folder, Configuration hconf) throws HiveException {
  try {
    if (!fs.exists(folder)) {
      return getDefaultEmptyContainer(keyContext, valueContext);
    }
    if (!fs.isDirectory(folder)) {
      throw new HiveException("Error, not a directory: " + folder);
    }
    FileStatus[] fileStatuses = fs.listStatus(folder);
    if (fileStatuses == null || fileStatuses.length == 0) {
      return getDefaultEmptyContainer(keyContext, valueContext);
    }
    AbstractSerDe keySerDe = keyContext.getSerDe();
    AbstractSerDe valueSerDe = valueContext.getSerDe();
    Writable keyContainer = keySerDe.getSerializedClass().newInstance();
    Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
    MapJoinTableContainer tableContainer = null;
    boolean useOptimizedContainer = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    for (FileStatus fileStatus : fileStatuses) {
      Path filePath = fileStatus.getPath();
      if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
        throw new HiveException("Error, not a file: " + filePath);
      }
      InputStream is = null;
      ObjectInputStream in = null;
      try {
        is = fs.open(filePath);
        in = new ObjectInputStream(is);
        String name = in.readUTF();
        Map<String, String> metaData = (Map<String, String>) in.readObject();
        if (tableContainer == null) {
          tableContainer = useOptimizedContainer
              ? new MapJoinBytesTableContainer(hconf, valueContext, -1, 0)
              : create(name, metaData);
        }
        tableContainer.setSerde(keyContext, valueContext);
        if (useOptimizedContainer) {
          loadOptimized((MapJoinBytesTableContainer) tableContainer, in, keyContainer, valueContainer);
        } else {
          loadNormal((MapJoinPersistableTableContainer) tableContainer, in, keyContainer, valueContainer);
        }
      } finally {
        if (in != null) {
          in.close();
        } else if (is != null) {
          is.close();
        }
      }
    }
    if (tableContainer != null) {
      tableContainer.seal();
    }
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
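The finally block above closes the ObjectInputStream if it was constructed and otherwise falls back to closing the raw stream. The short sketch below is not part of the Hive source; it only assumes a Hadoop FileSystem and a placeholder file path, and shows the same open-wrap-close sequence written with try-with-resources, where declaring both streams as resources gives equivalent cleanup behavior.

import java.io.InputStream;
import java.io.ObjectInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class StreamCloseSketch {
  // Reads only the header that load(...) above expects: the container name and its metadata map.
  static void readHeader(FileSystem fs, Path filePath) throws Exception {
    // Declaring both streams as resources closes the raw stream even if the
    // ObjectInputStream constructor fails, mirroring the manual finally block above.
    try (InputStream is = fs.open(filePath);
         ObjectInputStream in = new ObjectInputStream(is)) {
      String name = in.readUTF();
      Object metaData = in.readObject();
      System.out.println(name + " -> " + metaData);
    }
  }

  public static void main(String[] args) throws Exception {
    // Hypothetical invocation against the local file system; the path is a placeholder.
    FileSystem fs = FileSystem.getLocal(new Configuration());
    readHeader(fs, new Path("/tmp/mapjoin-table-file"));
  }
}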
use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
the class MapJoinTableContainerSerDe method load.
/**
 * Loads the table container. Only used on the MR path.
 * @param in Input stream.
 * @return The loaded table container.
 */
@SuppressWarnings({ "unchecked" })
public MapJoinPersistableTableContainer load(ObjectInputStream in) throws HiveException {
  AbstractSerDe keySerDe = keyContext.getSerDe();
  AbstractSerDe valueSerDe = valueContext.getSerDe();
  MapJoinPersistableTableContainer tableContainer;
  try {
    String name = in.readUTF();
    Map<String, String> metaData = (Map<String, String>) in.readObject();
    tableContainer = create(name, metaData);
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (ClassNotFoundException e) {
    throw new HiveException("Class Initialization error while trying to create table container", e);
  }
  try {
    Writable keyContainer = keySerDe.getSerializedClass().newInstance();
    Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
    int numKeys = in.readInt();
    for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
      MapJoinKeyObject key = new MapJoinKeyObject();
      key.read(keyContext, in, keyContainer);
      MapJoinEagerRowContainer values = new MapJoinEagerRowContainer();
      values.read(valueContext, in, valueContainer);
      tableContainer.put(key, values);
    }
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
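As implied by the reads above, the MR-path stream begins with a UTF container name, a serialized metadata map, and a key count, followed by SerDe-specific key and row payloads. The sketch below is not from the Hive source; it writes only that header with plain java.io calls. The container name and metadata are placeholders, and the per-key payload is omitted because its format depends on the configured key and value SerDes.

import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import java.util.Map;

public class MapJoinStreamLayoutSketch {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      // Header consumed by load(ObjectInputStream) above:
      out.writeUTF("org.example.SomeTableContainer");   // hypothetical container class name
      Map<String, String> metaData = new HashMap<>();
      metaData.put("example.key", "example.value");     // hypothetical metadata entry
      out.writeObject(metaData);
      out.writeInt(0);                                   // key count; per-key payload omitted,
                                                         // since it depends on the configured SerDes
    }
    System.out.println("wrote " + bytes.size() + " header bytes");
  }
}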
use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
the class MapJoinTableContainerSerDe method loadFastContainer.
/**
 * Loads the small table into a VectorMapJoinFastTableContainer. Only used on the Spark path.
 * @param mapJoinDesc The descriptor for the map join.
 * @param fs FileSystem that holds the folder.
 * @param folder The folder from which to load the table container.
 * @param hconf The Hive configuration.
 * @return The loaded table container.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, FileSystem fs, Path folder, Configuration hconf) throws HiveException {
  try {
    VectorMapJoinFastTableContainer tableContainer = new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
    tableContainer.setSerde(keyContext, valueContext);
    if (fs.exists(folder)) {
      if (!fs.isDirectory(folder)) {
        throw new HiveException("Error, not a directory: " + folder);
      }
      FileStatus[] fileStatuses = fs.listStatus(folder);
      if (fileStatuses != null && fileStatuses.length > 0) {
        AbstractSerDe keySerDe = keyContext.getSerDe();
        AbstractSerDe valueSerDe = valueContext.getSerDe();
        Writable key = keySerDe.getSerializedClass().newInstance();
        Writable value = valueSerDe.getSerializedClass().newInstance();
        for (FileStatus fileStatus : fileStatuses) {
          Path filePath = fileStatus.getPath();
          if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
            throw new HiveException("Error, not a file: " + filePath);
          }
          InputStream is = null;
          ObjectInputStream in = null;
          try {
            is = fs.open(filePath);
            in = new ObjectInputStream(is);
            // skip the name and metadata
            in.readUTF();
            in.readObject();
            int numKeys = in.readInt();
            for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
              key.readFields(in);
              long numRows = in.readLong();
              for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
                value.readFields(in);
                tableContainer.putRow(key, value);
              }
            }
          } finally {
            if (in != null) {
              in.close();
            } else if (is != null) {
              is.close();
            }
          }
        }
      }
    }
    tableContainer.seal();
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
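The inner loop above implies a per-file layout of name, metadata, key count, and then, for each key, its serialized fields, a row count, and the serialized rows. The following standalone sketch is not part of Hive; it writes and re-reads a stream in that shape, using org.apache.hadoop.io.Text as a stand-in for whatever Writable class the key and value SerDes actually report via getSerializedClass().

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import org.apache.hadoop.io.Text;

public class FastContainerLayoutSketch {
  public static void main(String[] args) throws Exception {
    // Write a stream in the shape loadFastContainer(...) above reads:
    // name, metadata, key count, then per key the key fields, a row count and the rows.
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeUTF("placeholder-name");
      out.writeObject(new HashMap<String, String>());
      out.writeInt(1);                    // one key
      new Text("key-1").write(out);       // Text stands in for the SerDe's serialized class
      out.writeLong(2L);                  // two rows for this key
      new Text("row-a").write(out);
      new Text("row-b").write(out);
    }

    // Read it back with the same loop structure as the method above.
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      in.readUTF();
      in.readObject();
      int numKeys = in.readInt();
      Text key = new Text();
      Text value = new Text();
      for (int k = 0; k < numKeys; k++) {
        key.readFields(in);
        long numRows = in.readLong();
        for (long r = 0; r < numRows; r++) {
          value.readFields(in);
          System.out.println(key + " -> " + value);
        }
      }
    }
  }
}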
use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
the class TestInputOutputFormat method testRowNumberUniquenessInDifferentSplits.
/**
 * This tests that {@link RecordReader#getRowNumber()} works with multiple splits;
 * see also {@link TestOrcFile#testPredicatePushdown()}.
 * @throws Exception
 */
@Test
public void testRowNumberUniquenessInDifferentSplits() throws Exception {
  Properties properties = new Properties();
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  StructObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // Save the conf variable values so that they can be restored later.
  long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
  long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);
  // Set the conf variable values for this test.
  // 10000 bytes per stripe
  long newStripeSize = 10000L;
  // 100 bytes per split
  long newMaxSplitSize = 100L;
  conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
  conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);
  AbstractSerDe serde = new OrcSerde();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
  // The following loop should create 20 stripes in the orc file.
  for (int i = 0; i < newStripeSize * 10; ++i) {
    writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
  }
  writer.close(true);
  serde = new OrcSerde();
  SerDeUtils.initializeSerDe(serde, conf, properties, null);
  assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
  inspector = (StructObjectInspector) serde.getObjectInspector();
  assertEquals("struct<x:int,y:int>", inspector.getTypeName());
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  int numExpectedSplits = 20;
  InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
  assertEquals(numExpectedSplits, splits.length);
  for (int i = 0; i < numExpectedSplits; ++i) {
    OrcSplit split = (OrcSplit) splits[i];
    Reader.Options orcReaderOptions = new Reader.Options();
    orcReaderOptions.range(split.getStart(), split.getLength());
    OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength());
    Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
    RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
    for (int j = 0; recordReader.hasNext(); j++) {
      long rowNum = (i * 5000) + j;
      long rowNumActual = recordReader.getRowNumber();
      assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
      Object row = recordReader.next(null);
    }
    recordReader.close();
  }
  // Reset the conf variable values that we changed for this test.
  if (oldDefaultStripeSize != -1L) {
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
  } else {
    // Nothing was set for the default stripe size previously, so unset it.
    conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
  }
  if (oldMaxSplitSize != -1L) {
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
  } else {
    // Nothing was set for the max split size previously, so unset it.
    conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
  }
}
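The test saves the stripe-size and max-split-size settings up front, overrides them, and at the end restores them or unsets them, using -1L as the "was not set" sentinel. A hypothetical helper such as the one sketched below, which is not part of Hive and uses a placeholder configuration key, captures the same save/override/restore pattern in one place.

import org.apache.hadoop.conf.Configuration;

public class ConfRestoreSketch {
  // Hypothetical helper mirroring the pattern in the test: remember whether the key
  // was set, override it for the duration of the body, then restore or unset it.
  static void withLongOverride(Configuration conf, String key, long testValue, Runnable body) {
    long old = conf.getLong(key, -1L);   // -1L marks "was not previously set", as in the test
    conf.setLong(key, testValue);
    try {
      body.run();
    } finally {
      if (old != -1L) {
        conf.setLong(key, old);
      } else {
        conf.unset(key);
      }
    }
  }

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // "example.max.split.size" is a placeholder key, not a real Hive or Hadoop setting.
    withLongOverride(conf, "example.max.split.size", 100L,
        () -> System.out.println("during test: " + conf.get("example.max.split.size")));
    System.out.println("after restore: " + conf.get("example.max.split.size"));
  }
}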
use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
the class DynamicPartitionFileRecordWriterContainer method getLocalFileWriter.
@Override
protected LocalFileWriter getLocalFileWriter(HCatRecord value) throws IOException, HCatException {
  OutputJobInfo localJobInfo = null;
  // Calculate which writer to use from the remaining values - this needs to
  // be done before we delete cols.
  List<String> dynamicPartValues = new ArrayList<String>();
  for (Integer colToAppend : dynamicPartCols) {
    Object partitionValue = value.get(colToAppend);
    dynamicPartValues.add(partitionValue == null ? HIVE_DEFAULT_PARTITION_VALUE : partitionValue.toString());
  }
  String dynKey = dynamicPartValues.toString();
  if (!baseDynamicWriters.containsKey(dynKey)) {
    if ((maxDynamicPartitions != -1) && (baseDynamicWriters.size() > maxDynamicPartitions)) {
      throw new HCatException(ErrorType.ERROR_TOO_MANY_DYNAMIC_PTNS, "Number of dynamic partitions being created " + "exceeds configured max allowable partitions[" + maxDynamicPartitions + "], increase parameter [" + HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS.varname + "] if needed.");
    }
    org.apache.hadoop.mapred.TaskAttemptContext currTaskContext = HCatMapRedUtil.createTaskAttemptContext(context);
    configureDynamicStorageHandler(currTaskContext, dynamicPartValues);
    localJobInfo = HCatBaseOutputFormat.getJobInfo(currTaskContext.getConfiguration());
    // Set up the SerDe.
    AbstractSerDe currSerDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), currTaskContext.getJobConf());
    try {
      InternalUtil.initializeOutputSerDe(currSerDe, currTaskContext.getConfiguration(), localJobInfo);
    } catch (SerDeException e) {
      throw new IOException("Failed to initialize SerDe", e);
    }
    // Create the base OutputFormat.
    org.apache.hadoop.mapred.OutputFormat baseOF = ReflectionUtils.newInstance(storageHandler.getOutputFormatClass(), currTaskContext.getJobConf());
    // We skip calling checkOutputSpecs() for each partition, as it can throw a
    // FileAlreadyExistsException when more than one mapper is writing to a partition
    // (see HCATALOG-490), and to avoid contacting the namenode for each new
    // FileOutputFormat instance. In general this should be fine for most
    // FileOutputFormat implementations, but it may become an issue for cases
    // where the method is used to perform other setup tasks.
    // Get the output committer.
    org.apache.hadoop.mapred.OutputCommitter baseOutputCommitter = currTaskContext.getJobConf().getOutputCommitter();
    // Create currJobContext last so it gets all the config changes.
    org.apache.hadoop.mapred.JobContext currJobContext = HCatMapRedUtil.createJobContext(currTaskContext);
    // Set up the job.
    baseOutputCommitter.setupJob(currJobContext);
    // Recreate to refresh the jobConf of the current task context.
    currTaskContext = HCatMapRedUtil.createTaskAttemptContext(currJobContext.getJobConf(), currTaskContext.getTaskAttemptID(), currTaskContext.getProgressible());
    // Set the temp location.
    currTaskContext.getConfiguration().set("mapred.work.output.dir", new FileOutputCommitter(new Path(localJobInfo.getLocation()), currTaskContext).getWorkPath().toString());
    // Set up the task.
    baseOutputCommitter.setupTask(currTaskContext);
    Path parentDir = new Path(currTaskContext.getConfiguration().get("mapred.work.output.dir"));
    Path childPath = new Path(parentDir, FileOutputFormat.getUniqueFile(currTaskContext, currTaskContext.getConfiguration().get("mapreduce.output.basename", "part"), ""));
    RecordWriter baseRecordWriter = baseOF.getRecordWriter(parentDir.getFileSystem(currTaskContext.getConfiguration()), currTaskContext.getJobConf(), childPath.toString(), InternalUtil.createReporter(currTaskContext));
    baseDynamicWriters.put(dynKey, baseRecordWriter);
    baseDynamicSerDe.put(dynKey, currSerDe);
    baseDynamicCommitters.put(dynKey, baseOutputCommitter);
    dynamicContexts.put(dynKey, currTaskContext);
    dynamicObjectInspectors.put(dynKey, InternalUtil.createStructObjectInspector(jobInfo.getOutputSchema()));
    dynamicOutputJobInfo.put(dynKey, HCatOutputFormat.getJobInfo(dynamicContexts.get(dynKey).getConfiguration()));
  }
  return new LocalFileWriter(baseDynamicWriters.get(dynKey), dynamicObjectInspectors.get(dynKey), baseDynamicSerDe.get(dynKey), dynamicOutputJobInfo.get(dynKey));
}
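The lookup above keys a cache of per-partition writers on the stringified list of dynamic partition values and creates a writer only on first use, subject to the configured partition limit. The simplified, self-contained sketch below is not the HCatalog API; a StringBuilder stands in for the real RecordWriter and the limit is hard-coded, but it illustrates the same caching logic.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DynamicWriterCacheSketch {
  static final int MAX_DYNAMIC_PARTITIONS = 3;                        // stand-in for the configured limit
  static final Map<String, StringBuilder> WRITERS = new HashMap<>();  // StringBuilder stands in for a RecordWriter

  // Simplified version of the lookup above: the stringified list of dynamic
  // partition values is the cache key; a writer is created only on first use.
  static StringBuilder writerFor(List<String> dynamicPartValues) {
    String dynKey = dynamicPartValues.toString();
    if (!WRITERS.containsKey(dynKey)) {
      if (WRITERS.size() > MAX_DYNAMIC_PARTITIONS) {
        throw new IllegalStateException("too many dynamic partitions: " + WRITERS.size());
      }
      WRITERS.put(dynKey, new StringBuilder("writer for " + dynKey));
    }
    return WRITERS.get(dynKey);
  }

  public static void main(String[] args) {
    List<String> parts = new ArrayList<>();
    parts.add("2024");
    parts.add("US");
    System.out.println(writerFor(parts));
    System.out.println(writerFor(parts) == writerFor(parts));  // same cached instance
  }
}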