Use of java.util.SortedMap in project hive by apache.
The class HiveHFileOutputFormat, method getHiveRecordWriter.
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
    Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
    final Progressable progressable) throws IOException {
  // Read configuration for the target path, first from jobconf, then from table properties
  String hfilePath = getFamilyPath(jc, tableProperties);
  if (hfilePath == null) {
    throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
  }
  // Target path's last component is also the column family name.
  final Path columnFamilyPath = new Path(hfilePath);
  final String columnFamilyName = columnFamilyPath.getName();
  final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
  final Job job = new Job(jc);
  setCompressOutput(job, isCompressed);
  setOutputPath(job, finalOutPath);
  // Create the HFile writer
  final org.apache.hadoop.mapreduce.TaskAttemptContext tac =
      ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), progressable);
  final Path outputdir = FileOutputFormat.getOutputPath(tac);
  final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
  final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter =
      getFileWriter(tac);
  // Individual columns are going to be pivoted to HBase cells, and for each row
  // they need to be written out in order of column name, so sort the column names
  // now, creating a mapping to their column position. However, the first column
  // is interpreted as the row key.
  String columnList = tableProperties.getProperty("columns");
  String[] columnArray = columnList.split(",");
  final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
  int i = 0;
  for (String columnName : columnArray) {
    if (i != 0) {
      columnMap.put(Bytes.toBytes(columnName), i);
    }
    ++i;
  }
  return new RecordWriter() {

    @Override
    public void close(boolean abort) throws IOException {
      try {
        fileWriter.close(null);
        if (abort) {
          return;
        }
        // Move the HFile(s) from the task output directory to the
        // location specified by the user.
        FileSystem fs = outputdir.getFileSystem(jc);
        fs.mkdirs(columnFamilyPath);
        Path srcDir = taskAttemptOutputdir;
        for (;;) {
          FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
          if ((files == null) || (files.length == 0)) {
            throw new IOException("No family directories found in " + srcDir);
          }
          if (files.length != 1) {
            throw new IOException("Multiple family directories found in " + srcDir);
          }
          srcDir = files[0].getPath();
          if (srcDir.getName().equals(columnFamilyName)) {
            break;
          }
          if (files[0].isFile()) {
            throw new IOException("No family directories found in " + taskAttemptOutputdir + ". "
                + "The last component in hfile path should match column family name "
                + columnFamilyName);
          }
        }
        for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
          fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
        }
        // Hive actually wants a file as task output (not a directory), so
        // replace the empty directory with an empty file to keep it happy.
        fs.delete(taskAttemptOutputdir, true);
        fs.createNewFile(taskAttemptOutputdir);
      } catch (InterruptedException ex) {
        throw new IOException(ex);
      }
    }
    private void writeText(Text text) throws IOException {
      // Decompose the incoming text row into fields, which are delimited by
      // the Ctrl-A (\u0001) separator Hive uses for serialized rows.
      String s = text.toString();
      String[] fields = s.split("\u0001");
      assert (fields.length <= (columnMap.size() + 1));
      // First field is the row key.
      byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
      // Remaining fields are cells addressed by column name within row.
      for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
        byte[] columnNameBytes = entry.getKey();
        int iColumn = entry.getValue();
        String val;
        if (iColumn >= fields.length) {
          // trailing blank field
          val = "";
        } else {
          val = fields[iColumn];
          if ("\\N".equals(val)) {
            // omit nulls
            continue;
          }
        }
        byte[] valBytes = Bytes.toBytes(val);
        KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
        try {
          fileWriter.write(null, kv);
        } catch (IOException e) {
          LOG.error("Failed while writing row: " + s);
          throw e;
        } catch (InterruptedException ex) {
          throw new IOException(ex);
        }
      }
    }
    private void writePut(PutWritable put) throws IOException {
      ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
      SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
      for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
        Collections.sort(entry.getValue(), new CellComparator());
        for (Cell c : entry.getValue()) {
          try {
            fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
          } catch (InterruptedException e) {
            throw (InterruptedIOException) new InterruptedIOException().initCause(e);
          }
        }
      }
    }
    @Override
    public void write(Writable w) throws IOException {
      if (w instanceof Text) {
        writeText((Text) w);
      } else if (w instanceof PutWritable) {
        writePut((PutWritable) w);
      } else {
        throw new IOException("Unexpected writable " + w);
      }
    }
  };
}
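A note on the column map above: a TreeMap cannot order raw byte[] keys on its own, since Java arrays do not implement Comparable and compare by identity, which is why the snippet passes Bytes.BYTES_COMPARATOR. A minimal standalone sketch of the same idea (the demo class is hypothetical, not part of Hive):

import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.hbase.util.Bytes;

// Hypothetical demo class, not from the Hive codebase.
public class ByteKeyMapDemo {
  public static void main(String[] args) {
    // Without the comparator, the first put would throw ClassCastException
    // because byte[] is not Comparable.
    SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    columnMap.put(Bytes.toBytes("b_col"), 2);
    columnMap.put(Bytes.toBytes("a_col"), 1);
    // Keys iterate in lexicographic (unsigned byte) order: a_col, then b_col,
    // which is the cell ordering HFiles require within a row.
    for (byte[] key : columnMap.keySet()) {
      System.out.println(Bytes.toString(key) + " -> " + columnMap.get(key));
    }
  }
}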
Use of java.util.SortedMap in project hive by apache.
The class MockUtils, method init.
static HBaseStore init(Configuration conf, HTableInterface htable,
    final SortedMap<String, Cell> rows) throws IOException {
  ((HiveConf) conf).setVar(ConfVars.METASTORE_EXPRESSION_PROXY_CLASS, NOOPProxy.class.getName());
  Mockito.when(htable.get(Mockito.any(Get.class))).thenAnswer(new Answer<Result>() {

    @Override
    public Result answer(InvocationOnMock invocation) throws Throwable {
      Get get = (Get) invocation.getArguments()[0];
      Cell cell = rows.get(new String(get.getRow()));
      if (cell == null) {
        return new Result();
      } else {
        return Result.create(new Cell[] { cell });
      }
    }
  });
  Mockito.when(htable.get(Mockito.anyListOf(Get.class))).thenAnswer(new Answer<Result[]>() {

    @Override
    public Result[] answer(InvocationOnMock invocation) throws Throwable {
      @SuppressWarnings("unchecked")
      List<Get> gets = (List<Get>) invocation.getArguments()[0];
      Result[] results = new Result[gets.size()];
      for (int i = 0; i < gets.size(); i++) {
        Cell cell = rows.get(new String(gets.get(i).getRow()));
        Result result;
        if (cell == null) {
          result = new Result();
        } else {
          result = Result.create(new Cell[] { cell });
        }
        results[i] = result;
      }
      return results;
    }
  });
  Mockito.when(htable.getScanner(Mockito.any(Scan.class))).thenAnswer(new Answer<ResultScanner>() {

    @Override
    public ResultScanner answer(InvocationOnMock invocation) throws Throwable {
      Scan scan = (Scan) invocation.getArguments()[0];
      List<Result> results = new ArrayList<Result>();
      String start = new String(scan.getStartRow());
      String stop = new String(scan.getStopRow());
      // subMap is [start, stop), matching HBase scan semantics.
      SortedMap<String, Cell> sub = rows.subMap(start, stop);
      for (Map.Entry<String, Cell> e : sub.entrySet()) {
        results.add(Result.create(new Cell[] { e.getValue() }));
      }
      final Iterator<Result> iter = results.iterator();
      // Only iterator() is meaningfully implemented; next() and next(int)
      // are stubs, so callers of this mock must iterate the scanner.
      return new ResultScanner() {

        @Override
        public Result next() throws IOException {
          return null;
        }

        @Override
        public Result[] next(int nbRows) throws IOException {
          return new Result[0];
        }

        @Override
        public void close() {
        }

        @Override
        public Iterator<Result> iterator() {
          return iter;
        }
      };
    }
  });
  Mockito.doAnswer(new Answer<Void>() {

    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      Put put = (Put) invocation.getArguments()[0];
      rows.put(new String(put.getRow()), put.getFamilyCellMap().firstEntry().getValue().get(0));
      return null;
    }
  }).when(htable).put(Mockito.any(Put.class));
  Mockito.when(htable.checkAndPut(Mockito.any(byte[].class), Mockito.any(byte[].class),
      Mockito.any(byte[].class), Mockito.any(byte[].class), Mockito.any(Put.class)))
      .thenAnswer(new Answer<Boolean>() {

        @Override
        public Boolean answer(InvocationOnMock invocation) throws Throwable {
          // Always say it succeeded and overwrite
          Put put = (Put) invocation.getArguments()[4];
          rows.put(new String(put.getRow()), put.getFamilyCellMap().firstEntry().getValue().get(0));
          return true;
        }
      });
  Mockito.doAnswer(new Answer<Void>() {

    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      Delete del = (Delete) invocation.getArguments()[0];
      rows.remove(new String(del.getRow()));
      return null;
    }
  }).when(htable).delete(Mockito.any(Delete.class));
  Mockito.when(htable.checkAndDelete(Mockito.any(byte[].class), Mockito.any(byte[].class),
      Mockito.any(byte[].class), Mockito.any(byte[].class), Mockito.any(Delete.class)))
      .thenAnswer(new Answer<Boolean>() {

        @Override
        public Boolean answer(InvocationOnMock invocation) throws Throwable {
          // Always say it succeeded
          Delete del = (Delete) invocation.getArguments()[4];
          rows.remove(new String(del.getRow()));
          return true;
        }
      });
  // Mock connection
  HBaseConnection hconn = Mockito.mock(HBaseConnection.class);
  Mockito.when(hconn.getHBaseTable(Mockito.anyString())).thenReturn(htable);
  HiveConf.setVar(conf, HiveConf.ConfVars.METASTORE_HBASE_CONNECTION_CLASS, HBaseReadWrite.TEST_CONN);
  HBaseReadWrite.setTestConnection(hconn);
  HBaseReadWrite.setConf(conf);
  HBaseStore store = new HBaseStore();
  store.setConf(conf);
  return store;
}
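The getScanner mock works because SortedMap.subMap returns a half-open [start, stop) view, which matches HBase scan semantics (start row inclusive, stop row exclusive). A small self-contained sketch of that behavior (class and values hypothetical):

import java.util.SortedMap;
import java.util.TreeMap;

// Hypothetical demo of the subMap call backing the mocked scanner.
public class SubMapDemo {
  public static void main(String[] args) {
    SortedMap<String, String> rows = new TreeMap<String, String>();
    rows.put("row1", "cell1");
    rows.put("row2", "cell2");
    rows.put("row3", "cell3");
    // Like an HBase scan, the start key is included and the stop key excluded.
    SortedMap<String, String> sub = rows.subMap("row1", "row3");
    System.out.println(sub.keySet()); // [row1, row2]
  }
}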
Use of java.util.SortedMap in project hive by apache.
The class HBaseUtils, method hashStorageDescriptor.
/**
 * Produce a hash for the storage descriptor.
 * @param sd storage descriptor to hash
 * @param md message digest to use to generate the hash
 * @return the hash as a byte array
 */
static byte[] hashStorageDescriptor(StorageDescriptor sd, MessageDigest md) {
  // Note all maps and lists have to be absolutely sorted. Otherwise we'll produce different
  // results for hashes based on the OS or JVM being used.
  md.reset();
  for (FieldSchema fs : sd.getCols()) {
    md.update(fs.getName().getBytes(ENCODING));
    md.update(fs.getType().getBytes(ENCODING));
    if (fs.getComment() != null) {
      md.update(fs.getComment().getBytes(ENCODING));
    }
  }
  if (sd.getInputFormat() != null) {
    md.update(sd.getInputFormat().getBytes(ENCODING));
  }
  if (sd.getOutputFormat() != null) {
    md.update(sd.getOutputFormat().getBytes(ENCODING));
  }
  md.update(sd.isCompressed() ? "true".getBytes(ENCODING) : "false".getBytes(ENCODING));
  md.update(Integer.toString(sd.getNumBuckets()).getBytes(ENCODING));
  if (sd.getSerdeInfo() != null) {
    SerDeInfo serde = sd.getSerdeInfo();
    if (serde.getName() != null) {
      md.update(serde.getName().getBytes(ENCODING));
    }
    if (serde.getSerializationLib() != null) {
      md.update(serde.getSerializationLib().getBytes(ENCODING));
    }
    if (serde.getParameters() != null) {
      SortedMap<String, String> params = new TreeMap<>(serde.getParameters());
      for (Map.Entry<String, String> param : params.entrySet()) {
        md.update(param.getKey().getBytes(ENCODING));
        md.update(param.getValue().getBytes(ENCODING));
      }
    }
  }
  if (sd.getBucketCols() != null) {
    SortedSet<String> bucketCols = new TreeSet<>(sd.getBucketCols());
    for (String bucket : bucketCols) {
      md.update(bucket.getBytes(ENCODING));
    }
  }
  if (sd.getSortCols() != null) {
    SortedSet<Order> orders = new TreeSet<>(sd.getSortCols());
    for (Order order : orders) {
      md.update(order.getCol().getBytes(ENCODING));
      md.update(Integer.toString(order.getOrder()).getBytes(ENCODING));
    }
  }
  if (sd.getSkewedInfo() != null) {
    SkewedInfo skewed = sd.getSkewedInfo();
    if (skewed.getSkewedColNames() != null) {
      SortedSet<String> colnames = new TreeSet<>(skewed.getSkewedColNames());
      for (String colname : colnames) {
        md.update(colname.getBytes(ENCODING));
      }
    }
    if (skewed.getSkewedColValues() != null) {
      SortedSet<String> sortedOuterList = new TreeSet<>();
      for (List<String> innerList : skewed.getSkewedColValues()) {
        SortedSet<String> sortedInnerList = new TreeSet<>(innerList);
        sortedOuterList.add(StringUtils.join(sortedInnerList, "."));
      }
      for (String colval : sortedOuterList) {
        md.update(colval.getBytes(ENCODING));
      }
    }
    if (skewed.getSkewedColValueLocationMaps() != null) {
      SortedMap<String, String> sortedMap = new TreeMap<>();
      for (Map.Entry<List<String>, String> smap : skewed.getSkewedColValueLocationMaps().entrySet()) {
        SortedSet<String> sortedKey = new TreeSet<>(smap.getKey());
        sortedMap.put(StringUtils.join(sortedKey, "."), smap.getValue());
      }
      for (Map.Entry<String, String> e : sortedMap.entrySet()) {
        md.update(e.getKey().getBytes(ENCODING));
        md.update(e.getValue().getBytes(ENCODING));
      }
    }
  }
  return md.digest();
}
Use of java.util.SortedMap in project hive by apache.
The class HiveRelDecorrelator, method decorrelateRel (LogicalProject overload).
/**
 * Rewrite LogicalProject.
 *
 * @param rel the project rel to rewrite
 * @return the rewritten frame, or null if the input has not been rewritten
 */
public Frame decorrelateRel(LogicalProject rel) throws SemanticException {
  //
  // Rewrite logic:
  //
  // 1. Pass along any correlated variables coming from the input.
  //
  final RelNode oldInput = rel.getInput();
  Frame frame = getInvoke(oldInput, rel);
  if (frame == null) {
    // If input has not been rewritten, do not rewrite this rel.
    return null;
  }
  final List<RexNode> oldProjects = rel.getProjects();
  final List<RelDataTypeField> relOutput = rel.getRowType().getFieldList();
  // LogicalProject projects the original expressions,
  // plus any correlated variables the input wants to pass along.
  final List<Pair<RexNode, String>> projects = Lists.newArrayList();
  // If this LogicalProject has correlated references, create a value generator
  // and produce the correlated variables in the new output.
  if (cm.mapRefRelToCorRef.containsKey(rel)) {
    frame = decorrelateInputWithValueGenerator(rel);
  }
  // LogicalProject projects the original expressions.
  final Map<Integer, Integer> mapOldToNewOutputs = new HashMap<>();
  int newPos;
  for (newPos = 0; newPos < oldProjects.size(); newPos++) {
    projects.add(newPos,
        Pair.of(decorrelateExpr(oldProjects.get(newPos)), relOutput.get(newPos).getName()));
    mapOldToNewOutputs.put(newPos, newPos);
  }
  // Project any correlated variables the input wants to pass along.
  final SortedMap<CorDef, Integer> corDefOutputs = new TreeMap<>();
  for (Map.Entry<CorDef, Integer> entry : frame.corDefOutputs.entrySet()) {
    projects.add(RexInputRef.of2(entry.getValue(), frame.r.getRowType().getFieldList()));
    corDefOutputs.put(entry.getKey(), newPos);
    newPos++;
  }
  RelNode newProject = HiveProject.create(frame.r, Pair.left(projects), Pair.right(projects));
  return register(rel, newProject, mapOldToNewOutputs, corDefOutputs);
}
Use of java.util.SortedMap in project hive by apache.
The class HiveRelDecorrelator, method decorrelateRel (HiveJoin overload).
public Frame decorrelateRel(HiveJoin rel) throws SemanticException {
  //
  // Rewrite logic:
  //
  // 1. Rewrite the join condition.
  // 2. Map output positions and produce cor vars if any.
  //
  final RelNode oldLeft = rel.getInput(0);
  final RelNode oldRight = rel.getInput(1);
  final Frame leftFrame = getInvoke(oldLeft, rel);
  final Frame rightFrame = getInvoke(oldRight, rel);
  if (leftFrame == null || rightFrame == null) {
    // If either input has not been rewritten, do not rewrite this rel.
    return null;
  }
  final RelNode newJoin = HiveJoin.getJoin(rel.getCluster(), leftFrame.r, rightFrame.r,
      decorrelateExpr(rel.getCondition()), rel.getJoinType());
  // Create the mapping between the output of the old correlation rel
  // and the new join rel.
  Map<Integer, Integer> mapOldToNewOutputs = Maps.newHashMap();
  int oldLeftFieldCount = oldLeft.getRowType().getFieldCount();
  int newLeftFieldCount = leftFrame.r.getRowType().getFieldCount();
  int oldRightFieldCount = oldRight.getRowType().getFieldCount();
  assert rel.getRowType().getFieldCount() == oldLeftFieldCount + oldRightFieldCount;
  // Left input positions are not changed.
  mapOldToNewOutputs.putAll(leftFrame.oldToNewOutputs);
  // Right input positions are shifted by newLeftFieldCount.
  for (int i = 0; i < oldRightFieldCount; i++) {
    mapOldToNewOutputs.put(i + oldLeftFieldCount, rightFrame.oldToNewOutputs.get(i) + newLeftFieldCount);
  }
  final SortedMap<CorDef, Integer> corDefOutputs = new TreeMap<>(leftFrame.corDefOutputs);
  // Correlated-variable positions from the right input are shifted by newLeftFieldCount too.
  for (Map.Entry<CorDef, Integer> entry : rightFrame.corDefOutputs.entrySet()) {
    corDefOutputs.put(entry.getKey(), entry.getValue() + newLeftFieldCount);
  }
  return register(rel, newJoin, mapOldToNewOutputs, corDefOutputs);
}