Use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project Phoenix by Apache.
The class MultiHfileOutputFormat, method configureIncrementalLoad.
/**
 * Configures the job for MultiHfileOutputFormat.
 * @param job the MapReduce job to configure
 * @param tablesToBeLoaded the target tables to be bulk loaded
 * @throws IOException
 */
public static void configureIncrementalLoad(Job job, List<TargetTableRef> tablesToBeLoaded) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(MultiHfileOutputFormat.class);
    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());
    // tableStartKeys for all tables.
    Set<TableRowkeyPair> tablesStartKeys = Sets.newTreeSet();
    for (TargetTableRef table : tablesToBeLoaded) {
        final String tableName = table.getPhysicalName();
        try (HTable htable = new HTable(conf, tableName)) {
            Set<TableRowkeyPair> startKeys = getRegionStartKeys(tableName, htable.getRegionLocator());
            tablesStartKeys.addAll(startKeys);
            String compressionConfig = configureCompression(htable.getTableDescriptor());
            String bloomTypeConfig = configureBloomType(htable.getTableDescriptor());
            String blockSizeConfig = configureBlockSize(htable.getTableDescriptor());
            String blockEncodingConfig = configureDataBlockEncoding(htable.getTableDescriptor());
            Map<String, String> tableConfigs = Maps.newHashMap();
            if (StringUtils.isNotBlank(compressionConfig)) {
                tableConfigs.put(COMPRESSION_FAMILIES_CONF_KEY, compressionConfig);
            }
            if (StringUtils.isNotBlank(bloomTypeConfig)) {
                tableConfigs.put(BLOOM_TYPE_FAMILIES_CONF_KEY, bloomTypeConfig);
            }
            if (StringUtils.isNotBlank(blockSizeConfig)) {
                tableConfigs.put(BLOCK_SIZE_FAMILIES_CONF_KEY, blockSizeConfig);
            }
            if (StringUtils.isNotBlank(blockEncodingConfig)) {
                tableConfigs.put(DATABLOCK_ENCODING_FAMILIES_CONF_KEY, blockEncodingConfig);
            }
            table.setConfiguration(tableConfigs);
            final String tableDefns = TargetTableRefFunctions.TO_JSON.apply(table);
            // set the table definition in the config so it can be used by the RecordWriter
            conf.set(tableName, tableDefns);
            TargetTableRef tbl = TargetTableRefFunctions.FROM_JSON.apply(tableDefns);
            LOG.info(" the table logical name is " + tbl.getLogicalName());
        }
    }
    LOG.info("Configuring " + tablesStartKeys.size() + " reduce partitions to match current region count");
    job.setNumReduceTasks(tablesStartKeys.size());
    configurePartitioner(job, tablesStartKeys);
    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
}
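
For orientation, a driver could wire this method into a bulk-load job roughly as sketched below. This is a minimal sketch, not the actual Phoenix loader: the single-argument TargetTableRef constructor, the table name, and the output path are assumptions made for illustration.

    // Hypothetical driver sketch: configure a job that writes HFiles per table via MultiHfileOutputFormat.
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "phoenix-bulkload");
    // the map output types must match what MultiHfileOutputFormat consumes
    job.setMapOutputKeyClass(TableRowkeyPair.class);
    job.setMapOutputValueClass(ImmutableBytesWritable.class);
    // assumption: TargetTableRef(String) exists; the real loader builds these from the
    // target table and its index tables
    List<TargetTableRef> tables = Lists.newArrayList(new TargetTableRef("MY_TABLE"));
    MultiHfileOutputFormat.configureIncrementalLoad(job, tables);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/hfiles"));  // example staging directory
    job.waitForCompletion(true);
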
Use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project Phoenix by Apache.
The class MultiHfileOutputFormat, method getRegionStartKeys.
/**
 * Return the start keys of all of the regions in this table,
 * as a set of TableRowkeyPair entries (table name plus region start key).
 */
private static Set<TableRowkeyPair> getRegionStartKeys(String tableName, RegionLocator table) throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    Set<TableRowkeyPair> ret = new TreeSet<TableRowkeyPair>();
    for (byte[] byteKey : byteKeys) {
        // phoenix-2216: start : passing the table name and startkey
        ret.add(new TableRowkeyPair(tableName, new ImmutableBytesWritable(byteKey)));
    }
    return ret;
}
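
Because start keys from several tables land in one sorted set, the ordering comes from TableRowkeyPair's comparator. A small illustrative sketch, under the assumption that it orders by table name and then by row key, with made-up table names and keys:

    // Illustrative only: how pairs from two tables interleave in a TreeSet,
    // assuming TableRowkeyPair compares by table name and then by row key.
    Set<TableRowkeyPair> keys = new TreeSet<TableRowkeyPair>();
    keys.add(new TableRowkeyPair("T2", new ImmutableBytesWritable(Bytes.toBytes("b"))));
    keys.add(new TableRowkeyPair("T1", new ImmutableBytesWritable(Bytes.toBytes("m"))));
    keys.add(new TableRowkeyPair("T1", new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY)));
    for (TableRowkeyPair pair : keys) {
        // expected order: (T1, ""), (T1, "m"), (T2, "b")
        System.out.println(pair.getTableName() + " / " + Bytes.toStringBinary(pair.getRowkey().get()));
    }
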
Use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project Phoenix by Apache.
The class FormatToBytesWritableMapper, method writeAggregatedRow.
/**
 * Collect all column values for the same row. The RowKey may differ if indexes are involved,
 * so a separate record is written for each unique RowKey.
 *
 * @param context Current mapper context
 * @param tableName Name of the table the key values belong to
 * @param lkv List of KV values that will be combined into a single ImmutableBytesWritable
 * @throws IOException
 * @throws InterruptedException
 */
private void writeAggregatedRow(Context context, String tableName, List<KeyValue> lkv) throws IOException, InterruptedException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    DataOutputStream outputStream = new DataOutputStream(bos);
    ImmutableBytesWritable outputKey = null;
    if (!lkv.isEmpty()) {
        for (KeyValue cell : lkv) {
            if (outputKey == null || Bytes.compareTo(outputKey.get(), outputKey.getOffset(), outputKey.getLength(),
                    cell.getRowArray(), cell.getRowOffset(), cell.getRowLength()) != 0) {
                // This is the first RowKey, or it differs from the previous one
                if (outputKey != null) {
                    // It's a different RowKey, so we need to write out the buffered row
                    ImmutableBytesWritable aggregatedArray = new ImmutableBytesWritable(bos.toByteArray());
                    outputStream.close();
                    context.write(new TableRowkeyPair(tableName, outputKey), aggregatedArray);
                }
                outputKey = new ImmutableBytesWritable(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength());
                bos = new ByteArrayOutputStream(1024);
                outputStream = new DataOutputStream(bos);
            }
            /*
             * The order of aggregation: type, timestamp, index of column, length of value, value itself
             */
            int i = findIndex(cell);
            if (i == -1) {
                // skip KVs that do not belong to the local index
                continue;
            }
            outputStream.writeByte(cell.getTypeByte());
            WritableUtils.writeVLong(outputStream, cell.getTimestamp());
            WritableUtils.writeVInt(outputStream, i);
            WritableUtils.writeVInt(outputStream, cell.getValueLength());
            outputStream.write(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
        }
        ImmutableBytesWritable aggregatedArray = new ImmutableBytesWritable(bos.toByteArray());
        outputStream.close();
        context.write(new TableRowkeyPair(tableName, outputKey), aggregatedArray);
    }
}
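
On the reduce side the aggregated value has to be unpacked in exactly the order it was written here: type byte, vlong timestamp, vint column index, vint value length, then the value bytes. A hedged sketch of such a decoder follows; it only illustrates the wire format and is not the actual Phoenix reducer.

    // Sketch: decode one aggregated row produced by writeAggregatedRow above.
    // The field order must mirror the writer: type, timestamp, column index, value length, value.
    void decodeAggregatedRow(ImmutableBytesWritable aggregated) throws IOException {
        DataInputStream in = new DataInputStream(
                new ByteArrayInputStream(aggregated.get(), aggregated.getOffset(), aggregated.getLength()));
        while (in.available() > 0) {
            byte type = in.readByte();                     // KeyValue type byte (Put, DeleteColumn, ...)
            long timestamp = WritableUtils.readVLong(in);  // cell timestamp
            int columnIndex = WritableUtils.readVInt(in);  // index into the job's column list
            int valueLength = WritableUtils.readVInt(in);  // length of the value that follows
            byte[] value = new byte[valueLength];
            in.readFully(value);
            // ... look up the column by index and rebuild the cell for this row ...
        }
    }
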
Use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project Phoenix by Apache.
The class MultiHfileOutputFormat, method writePartitions.
private static void writePartitions(Configuration conf, Path partitionsPath, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (tablesStartKeys.isEmpty()) {
        throw new IllegalArgumentException("No regions passed");
    }
    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<TableRowkeyPair> sorted = new TreeSet<TableRowkeyPair>(tablesStartKeys);
    TableRowkeyPair first = sorted.first();
    if (!first.getRowkey().equals(HConstants.EMPTY_BYTE_ARRAY)) {
        throw new IllegalArgumentException("First region of table should have empty start key. Instead has: "
            + Bytes.toStringBinary(first.getRowkey().get()));
    }
    sorted.remove(first);
    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath, TableRowkeyPair.class, NullWritable.class);
    try {
        for (TableRowkeyPair startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}
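
The partitions file written here is what configurePartitioner points the job's partitioner at. Reading it back can be handy when debugging a bulk load; a minimal sketch, assuming TableRowkeyPair has the no-argument constructor that Writable deserialization requires:

    // Sketch: dump the split points written by writePartitions.
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionsPath, conf);
    try {
        TableRowkeyPair key = new TableRowkeyPair();
        NullWritable value = NullWritable.get();
        while (reader.next(key, value)) {
            System.out.println(key.getTableName() + " -> " + Bytes.toStringBinary(key.getRowkey().get()));
        }
    } finally {
        reader.close();
    }
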
Use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project Phoenix by Apache.
The class MultiHfileOutputFormat, method createRecordWriter.
/**
 * @param context the task attempt context
 * @return a RecordWriter that writes HFiles for each table and column family
 * @throws IOException
 */
static <V extends Cell> RecordWriter<TableRowkeyPair, V> createRecordWriter(final TaskAttemptContext context) throws IOException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr);
    final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude", false);
    return new RecordWriter<TableRowkeyPair, V>() {
        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(Bytes.BYTES_COMPARATOR);
        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
        private final byte[] now = Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis());
        private boolean rollRequested = false;

        @Override
        public void write(TableRowkeyPair row, V cell) throws IOException {
            KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
            // null input == user explicitly wants to flush
            if (row == null && kv == null) {
                rollWriters();
                return;
            }
            // phoenix-2216: start : extract table name from the rowkey
            String tableName = row.getTableName();
            byte[] rowKey = row.getRowkey().get();
            long length = kv.getLength();
            byte[] family = CellUtil.cloneFamily(kv);
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            WriterLength wl = this.writers.get(tableAndFamily);
            // If this is a new column family, make sure its output directory exists
            if (wl == null) {
                // phoenix-2216: start : create a directory for table and family within the output dir
                Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
                fs.mkdirs(new Path(tableOutputPath, Bytes.toString(family)));
                // phoenix-2216: end
            }
            // If the current writer has reached maxsize, we need to roll all the writers
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }
            // This can only happen once a row is finished though
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }
            // create a new HFile writer, if necessary
            if (wl == null || wl.writer == null) {
                // phoenix-2216: start : pass the table name as well
                wl = getNewWriter(tableName, family, conf);
                // phoenix-2216: end
            }
            // we now have the proper HFile writer. full steam ahead
            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;
            // Copy the row so we know when a row transition occurs.
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath() + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        /* Create a new StoreFile.Writer.
         * @param family
         * @return A WriterLength, containing a new StoreFile.Writer.
         * @throws IOException
         */
        @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED", justification = "Not important")
        private WriterLength getNewWriter(final String tableName, byte[] family, Configuration conf) throws IOException {
            WriterLength wl = new WriterLength();
            Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
            Path familydir = new Path(tableOutputPath, Bytes.toString(family));
            // phoenix-2216: start : fetch the configuration properties that were set on the table.
            // create a map from column family to the compression algorithm for the table.
            final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf, tableName);
            final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf, tableName);
            final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf, tableName);
            // phoenix-2216: end
            String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
            final Map<byte[], DataBlockEncoding> datablockEncodingMap = createFamilyDataBlockEncodingMap(conf, tableName);
            final DataBlockEncoding overriddenEncoding;
            if (dataBlockEncodingStr != null) {
                overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
            } else {
                overriddenEncoding = null;
            }
            Algorithm compression = compressionMap.get(family);
            compression = compression == null ? defaultCompression : compression;
            BloomType bloomType = bloomTypeMap.get(family);
            bloomType = bloomType == null ? BloomType.NONE : bloomType;
            Integer blockSize = blockSizeMap.get(family);
            blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
            DataBlockEncoding encoding = overriddenEncoding;
            encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
            encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
            Configuration tempConf = new Configuration(conf);
            tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
            HFileContextBuilder contextBuilder = new HFileContextBuilder()
                .withCompression(compression)
                .withChecksumType(HStore.getChecksumType(conf))
                .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf))
                .withBlockSize(blockSize);
            contextBuilder.withDataBlockEncoding(encoding);
            HFileContext hFileContext = contextBuilder.build();
            wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
                .withOutputDir(familydir)
                .withBloomType(bloomType)
                .withComparator(KeyValue.COMPARATOR)
                .withFileContext(hFileContext)
                .build();
            // phoenix-2216: start : the writers map is keyed by a joined byte array of table name and family.
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            this.writers.put(tableAndFamily, wl);
            // phoenix-2216: end
            return wl;
        }

        private void close(final StoreFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude));
                w.appendTrackedTimestampsToMetadata();
                w.close();
            }
        }

        @Override
        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}
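
The record writer above refers to two small helpers that are not shown in this snippet: the WriterLength holder and the join(tableName, family) key builder. Their likely shape, inferred from how they are used, is sketched below; the delimiter in join is an assumption, not the actual Phoenix implementation.

    // Inferred from usage above, not copied from the Phoenix source.
    static class WriterLength {
        long written = 0;                  // bytes written to the current HFile
        StoreFile.Writer writer = null;    // the open HFile writer for one table/family
    }

    // Builds the writers-map key from table name and column family. The real
    // implementation may use a different delimiter; "-" here is an assumption.
    private static byte[] join(String tableName, String family) {
        return Bytes.toBytes(tableName + "-" + family);
    }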