Use of org.apache.hadoop.io.Writable in project hive by apache.
In the class HiveHFileOutputFormat, method getHiveRecordWriter:
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException {
  // Read configuration for the target path, first from jobconf, then from table properties
  String hfilePath = getFamilyPath(jc, tableProperties);
  if (hfilePath == null) {
    throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
  }
  // Target path's last component is also the column family name.
  final Path columnFamilyPath = new Path(hfilePath);
  final String columnFamilyName = columnFamilyPath.getName();
  final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
  final Job job = new Job(jc);
  setCompressOutput(job, isCompressed);
  setOutputPath(job, finalOutPath);
  // Create the HFile writer
  final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
  final Path outputdir = FileOutputFormat.getOutputPath(tac);
  final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
  final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(tac);
  // Individual columns are going to be pivoted to HBase cells,
  // and for each row, they need to be written out in order
  // of column name, so sort the column names now, creating a
  // mapping to their column position. However, the first
  // column is interpreted as the row key.
  String columnList = tableProperties.getProperty("columns");
  String[] columnArray = columnList.split(",");
  final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
  int i = 0;
  for (String columnName : columnArray) {
    if (i != 0) {
      columnMap.put(Bytes.toBytes(columnName), i);
    }
    ++i;
  }
  return new RecordWriter() {

    @Override
    public void close(boolean abort) throws IOException {
      try {
        fileWriter.close(null);
        if (abort) {
          return;
        }
        // Move the hfiles file(s) from the task output directory to the
        // location specified by the user.
        FileSystem fs = outputdir.getFileSystem(jc);
        fs.mkdirs(columnFamilyPath);
        Path srcDir = taskAttemptOutputdir;
        for (; ; ) {
          FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
          if ((files == null) || (files.length == 0)) {
            throw new IOException("No family directories found in " + srcDir);
          }
          if (files.length != 1) {
            throw new IOException("Multiple family directories found in " + srcDir);
          }
          srcDir = files[0].getPath();
          if (srcDir.getName().equals(columnFamilyName)) {
            break;
          }
          if (files[0].isFile()) {
            throw new IOException("No family directories found in " + taskAttemptOutputdir + ". " + "The last component in hfile path should match column family name " + columnFamilyName);
          }
        }
        for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
          fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
        }
        // Hive actually wants a file as task output (not a directory), so
        // replace the empty directory with an empty file to keep it happy.
        fs.delete(taskAttemptOutputdir, true);
        fs.createNewFile(taskAttemptOutputdir);
      } catch (InterruptedException ex) {
        throw new IOException(ex);
      }
    }

    private void writeText(Text text) throws IOException {
      // Decompose the incoming text row into fields.
      String s = text.toString();
      String[] fields = s.split("\u0001");
      assert (fields.length <= (columnMap.size() + 1));
      // First field is the row key.
      byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
      // Remaining fields are cells addressed by column name within row.
      for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
        byte[] columnNameBytes = entry.getKey();
        int iColumn = entry.getValue();
        String val;
        if (iColumn >= fields.length) {
          // trailing blank field
          val = "";
        } else {
          val = fields[iColumn];
          if ("\\N".equals(val)) {
            // omit nulls
            continue;
          }
        }
        byte[] valBytes = Bytes.toBytes(val);
        KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
        try {
          fileWriter.write(null, kv);
        } catch (IOException e) {
          LOG.error("Failed while writing row: " + s);
          throw e;
        } catch (InterruptedException ex) {
          throw new IOException(ex);
        }
      }
    }

    private void writePut(PutWritable put) throws IOException {
      ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
      SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
      for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
        Collections.sort(entry.getValue(), new CellComparator());
        for (Cell c : entry.getValue()) {
          try {
            fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
          } catch (InterruptedException e) {
            throw (InterruptedIOException) new InterruptedIOException().initCause(e);
          }
        }
      }
    }

    @Override
    public void write(Writable w) throws IOException {
      if (w instanceof Text) {
        writeText((Text) w);
      } else if (w instanceof PutWritable) {
        writePut((PutWritable) w);
      } else {
        throw new IOException("Unexpected writable " + w);
      }
    }
  };
}
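The sorted column map built above is what guarantees that the cells of each row are emitted in the byte-wise qualifier order HFiles require. A minimal standalone sketch of that ordering (the class name and column list below are illustrative, not taken from Hive):

import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.hbase.util.Bytes;

public class ColumnOrderSketch {
  public static void main(String[] args) {
    // Same construction as in getHiveRecordWriter: a byte-wise comparator so iteration
    // follows the qualifier order in which KeyValues must be written per row.
    SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    String[] columnArray = { "rowkey", "value", "count", "address" }; // hypothetical "columns" property
    int i = 0;
    for (String columnName : columnArray) {
      if (i != 0) { // the first column is the row key, so it is not mapped
        columnMap.put(Bytes.toBytes(columnName), i);
      }
      ++i;
    }
    // Prints address, count, value -- lexicographic by qualifier bytes, not declaration order.
    for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
      System.out.println(Bytes.toString(entry.getKey()) + " -> field " + entry.getValue());
    }
  }
}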
Use of org.apache.hadoop.io.Writable in project hive by apache.
In the class FetchOperator, method getRecordReader:
private RecordReader<WritableComparable, Writable> getRecordReader() throws Exception {
  if (!iterSplits.hasNext()) {
    FetchInputFormatSplit[] splits = getNextSplits();
    if (splits == null) {
      return null;
    }
    if (!isPartitioned || convertedOI == null) {
      currSerDe = tableSerDe;
      ObjectConverter = null;
    } else {
      currSerDe = needConversion(currDesc) ? currDesc.getDeserializer(job) : tableSerDe;
      ObjectInspector inputOI = currSerDe.getObjectInspector();
      ObjectConverter = ObjectInspectorConverters.getConverter(inputOI, convertedOI);
    }
    if (isPartitioned) {
      row[1] = createPartValue(currDesc, partKeyOI);
    }
    iterSplits = Arrays.asList(splits).iterator();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Creating fetchTask with deserializer typeinfo: " + currSerDe.getObjectInspector().getTypeName());
      LOG.debug("deserializer properties:\ntable properties: " + currDesc.getTableDesc().getProperties() + "\npartition properties: " + currDesc.getProperties());
    }
  }
  final FetchInputFormatSplit target = iterSplits.next();
  @SuppressWarnings("unchecked")
  final RecordReader<WritableComparable, Writable> reader = target.getRecordReader(job);
  if (hasVC || work.getSplitSample() != null) {
    currRecReader = new HiveRecordReader<WritableComparable, Writable>(reader, job) {

      @Override
      public boolean doNext(WritableComparable key, Writable value) throws IOException {
        // if the current position is past shrinkedLength, we have fetched enough rows for
        // this split by table sampling, so stop fetching any more (early exit)
        if (target.shrinkedLength > 0 && context.getIoCxt().getCurrentBlockStart() > target.shrinkedLength) {
          return false;
        }
        return super.doNext(key, value);
      }
    };
    ((HiveContextAwareRecordReader) currRecReader).initIOContext(target, job, target.inputFormat.getClass(), reader);
  } else {
    currRecReader = reader;
  }
  key = currRecReader.createKey();
  value = currRecReader.createValue();
  headerCount = footerCount = 0;
  return currRecReader;
}
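For context, the reader returned here follows the old mapred RecordReader contract: createKey/createValue once, then next(key, value) until it returns false. A self-contained sketch of that loop under assumed inputs (TextInputFormat and the /tmp/sample.txt path are illustrations, not what FetchOperator itself uses):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class ReaderLoopSketch {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    FileInputFormat.setInputPaths(job, new Path("/tmp/sample.txt")); // assumed input path
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(job);
    for (InputSplit split : inputFormat.getSplits(job, 1)) {
      RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();   // created once and reused for every record
      Text value = reader.createValue();       // created once and reused for every record
      while (reader.next(key, value)) {
        System.out.println(key.get() + "\t" + value);
      }
      reader.close();
    }
  }
}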
Use of org.apache.hadoop.io.Writable in project hive by apache.
In the class MapJoinTableContainerSerDe, method loadFastContainer:
/**
 * Loads the small table into a VectorMapJoinFastTableContainer. Only used on Spark path.
 * @param mapJoinDesc The descriptor for the map join
 * @param fs FileSystem of the folder.
 * @param folder The folder to load table container.
 * @param hconf The hive configuration
 * @return Loaded table.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer loadFastContainer(MapJoinDesc mapJoinDesc, FileSystem fs, Path folder, Configuration hconf) throws HiveException {
  try {
    VectorMapJoinFastTableContainer tableContainer = new VectorMapJoinFastTableContainer(mapJoinDesc, hconf, -1);
    tableContainer.setSerde(keyContext, valueContext);
    if (fs.exists(folder)) {
      if (!fs.isDirectory(folder)) {
        throw new HiveException("Error, not a directory: " + folder);
      }
      FileStatus[] fileStatuses = fs.listStatus(folder);
      if (fileStatuses != null && fileStatuses.length > 0) {
        AbstractSerDe keySerDe = keyContext.getSerDe();
        AbstractSerDe valueSerDe = valueContext.getSerDe();
        Writable key = keySerDe.getSerializedClass().newInstance();
        Writable value = valueSerDe.getSerializedClass().newInstance();
        for (FileStatus fileStatus : fileStatuses) {
          Path filePath = fileStatus.getPath();
          if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
            throw new HiveException("Error, not a file: " + filePath);
          }
          InputStream is = null;
          ObjectInputStream in = null;
          try {
            is = fs.open(filePath, 4096);
            in = new ObjectInputStream(is);
            // skip the name and metadata
            in.readUTF();
            in.readObject();
            int numKeys = in.readInt();
            for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
              key.readFields(in);
              long numRows = in.readLong();
              for (long rowIndex = 0L; rowIndex < numRows; rowIndex++) {
                value.readFields(in);
                tableContainer.putRow(key, value);
              }
            }
          } finally {
            if (in != null) {
              in.close();
            } else if (is != null) {
              is.close();
            }
          }
        }
      }
    }
    tableContainer.seal();
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
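The read loop above implies a simple per-file layout: a UTF name, a serialized metadata map, a key count, and for each key a row count followed by that many serialized rows. A hedged sketch of a writer producing that layout (the path, name, and BytesWritable payloads are illustrative assumptions; Hive's actual dump code is not shown here):

import java.io.FileOutputStream;
import java.io.ObjectOutputStream;
import java.util.HashMap;
import org.apache.hadoop.io.BytesWritable;

public class ContainerDumpSketch {
  public static void main(String[] args) throws Exception {
    try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream("/tmp/smalltable.bin"))) {
      out.writeUTF("smalltable");                     // name (skipped by loadFastContainer)
      out.writeObject(new HashMap<String, String>()); // metadata (also skipped)
      out.writeInt(1);                                // number of distinct keys
      new BytesWritable("k1".getBytes()).write(out);  // key, written through Writable.write
      out.writeLong(2L);                              // rows stored under this key
      new BytesWritable("v1".getBytes()).write(out);  // row 1
      new BytesWritable("v2".getBytes()).write(out);  // row 2
    }
  }
}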
Use of org.apache.hadoop.io.Writable in project hive by apache.
In the class MapJoinTableContainerSerDe, method load:
/**
 * Loads the table container. Only used on MR path.
 * @param in Input stream.
 * @return Loaded table.
 */
@SuppressWarnings({ "unchecked" })
public MapJoinPersistableTableContainer load(ObjectInputStream in) throws HiveException {
  AbstractSerDe keySerDe = keyContext.getSerDe();
  AbstractSerDe valueSerDe = valueContext.getSerDe();
  MapJoinPersistableTableContainer tableContainer;
  try {
    String name = in.readUTF();
    Map<String, String> metaData = (Map<String, String>) in.readObject();
    tableContainer = create(name, metaData);
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (ClassNotFoundException e) {
    throw new HiveException("Class Initialization error while trying to create table container", e);
  }
  try {
    Writable keyContainer = keySerDe.getSerializedClass().newInstance();
    Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
    int numKeys = in.readInt();
    for (int keyIndex = 0; keyIndex < numKeys; keyIndex++) {
      MapJoinKeyObject key = new MapJoinKeyObject();
      key.read(keyContext, in, keyContainer);
      MapJoinEagerRowContainer values = new MapJoinEagerRowContainer();
      values.read(valueContext, in, valueContainer);
      tableContainer.put(key, values);
    }
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
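Both load paths rely on the same Writable-reuse idiom: a single container instance is created via getSerializedClass().newInstance() and then refilled by readFields() for every record. A small standalone sketch of that idiom (BytesWritable and the in-memory buffer are assumptions for illustration):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;

public class WritableReuseSketch {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(buffer);
    new BytesWritable("row-1".getBytes()).write(out);
    new BytesWritable("row-2".getBytes()).write(out);

    Writable container = BytesWritable.class.newInstance(); // one instance, reused below
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
    for (int i = 0; i < 2; i++) {
      container.readFields(in); // overwrites the same container with the next record
      System.out.println(container);
    }
  }
}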
Use of org.apache.hadoop.io.Writable in project hive by apache.
In the class MapJoinTableContainerSerDe, method load (FileSystem overload):
/**
 * Loads the table container from a folder. Only used on Spark path.
 * @param fs FileSystem of the folder.
 * @param folder The folder to load table container.
 * @param hconf The hive configuration
 * @return Loaded table.
 */
@SuppressWarnings("unchecked")
public MapJoinTableContainer load(FileSystem fs, Path folder, Configuration hconf) throws HiveException {
  try {
    if (!fs.exists(folder)) {
      return getDefaultEmptyContainer(keyContext, valueContext);
    }
    if (!fs.isDirectory(folder)) {
      throw new HiveException("Error, not a directory: " + folder);
    }
    FileStatus[] fileStatuses = fs.listStatus(folder);
    if (fileStatuses == null || fileStatuses.length == 0) {
      return getDefaultEmptyContainer(keyContext, valueContext);
    }
    AbstractSerDe keySerDe = keyContext.getSerDe();
    AbstractSerDe valueSerDe = valueContext.getSerDe();
    Writable keyContainer = keySerDe.getSerializedClass().newInstance();
    Writable valueContainer = valueSerDe.getSerializedClass().newInstance();
    MapJoinTableContainer tableContainer = null;
    boolean useOptimizedContainer = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    for (FileStatus fileStatus : fileStatuses) {
      Path filePath = fileStatus.getPath();
      if (ShimLoader.getHadoopShims().isDirectory(fileStatus)) {
        throw new HiveException("Error, not a file: " + filePath);
      }
      InputStream is = null;
      ObjectInputStream in = null;
      try {
        is = fs.open(filePath, 4096);
        in = new ObjectInputStream(is);
        String name = in.readUTF();
        Map<String, String> metaData = (Map<String, String>) in.readObject();
        if (tableContainer == null) {
          tableContainer = useOptimizedContainer ? new MapJoinBytesTableContainer(hconf, valueContext, -1, 0) : create(name, metaData);
        }
        tableContainer.setSerde(keyContext, valueContext);
        if (useOptimizedContainer) {
          loadOptimized((MapJoinBytesTableContainer) tableContainer, in, keyContainer, valueContainer);
        } else {
          loadNormal((MapJoinPersistableTableContainer) tableContainer, in, keyContainer, valueContainer);
        }
      } finally {
        if (in != null) {
          in.close();
        } else if (is != null) {
          is.close();
        }
      }
    }
    if (tableContainer != null) {
      tableContainer.seal();
    }
    return tableContainer;
  } catch (IOException e) {
    throw new HiveException("IO error while trying to create table container", e);
  } catch (Exception e) {
    throw new HiveException("Error while trying to create table container", e);
  }
}
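The switch between MapJoinBytesTableContainer and the persistable container is driven by the HIVEMAPJOINUSEOPTIMIZEDTABLE setting read above. A short standalone sketch of reading that flag (assumes a HiveConf on the classpath; the value set here is only an example):

import org.apache.hadoop.hive.conf.HiveConf;

public class ContainerChoiceSketch {
  public static void main(String[] args) {
    HiveConf hconf = new HiveConf();
    hconf.setBoolVar(HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, true); // example value
    boolean useOptimizedContainer = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
    System.out.println("use optimized map-join table container: " + useOptimizedContainer);
  }
}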