use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hbase by apache.
the class HFileOutputFormat2 method configureIncrementalLoad.
static void configureIncrementalLoad(Job job, HTableDescriptor tableDescriptor, RegionLocator regionLocator,
    Class<? extends OutputFormat<?, ?>> cls) throws IOException, UnsupportedEncodingException {
  Configuration conf = job.getConfiguration();
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(KeyValue.class);
  job.setOutputFormatClass(cls);
  // TODO it would be nice to pick one or the other of these formats.
  if (KeyValue.class.equals(job.getMapOutputValueClass())) {
    job.setReducerClass(KeyValueSortReducer.class);
  } else if (Put.class.equals(job.getMapOutputValueClass())) {
    job.setReducerClass(PutSortReducer.class);
  } else if (Text.class.equals(job.getMapOutputValueClass())) {
    job.setReducerClass(TextSortReducer.class);
  } else {
    LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
  }
  conf.setStrings("io.serializations", conf.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName(),
      KeyValueSerialization.class.getName());
  if (conf.getBoolean(LOCALITY_SENSITIVE_CONF_KEY, DEFAULT_LOCALITY_SENSITIVE)) {
    // record this table name for creating writer by favored nodes
    LOG.info("bulkload locality sensitive enabled");
    conf.set(OUTPUT_TABLE_NAME_CONF_KEY, regionLocator.getName().getNameAsString());
  }
  // Use table's region boundaries for TOP split points.
  LOG.info("Looking up current regions for table " + regionLocator.getName());
  List<ImmutableBytesWritable> startKeys = getRegionStartKeys(regionLocator);
  LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");
  job.setNumReduceTasks(startKeys.size());
  configurePartitioner(job, startKeys);
  // Set compression algorithms based on column families
  configureCompression(conf, tableDescriptor);
  configureBloomType(tableDescriptor, conf);
  configureBlockSize(tableDescriptor, conf);
  configureDataBlockEncoding(tableDescriptor, conf);
  TableMapReduceUtil.addDependencyJars(job);
  TableMapReduceUtil.initCredentials(job);
  LOG.info("Incremental table " + regionLocator.getName() + " output configured.");
}
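For context, a minimal driver that exercises this configuration could look like the sketch below. It assumes HBase 1.x-style APIs (Connection, RegionLocator, LoadIncrementalHFiles); the table name, paths, and WordToPutMapper are placeholders, not part of the code above.

  // Hypothetical bulk-load driver; table name, paths, and WordToPutMapper are illustrative only.
  Configuration conf = HBaseConfiguration.create();
  Job job = Job.getInstance(conf, "bulkload-example");
  job.setJarByClass(WordToPutMapper.class);
  job.setMapperClass(WordToPutMapper.class);                  // emits <ImmutableBytesWritable, Put>
  job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  job.setMapOutputValueClass(Put.class);                      // so PutSortReducer is chosen above
  FileInputFormat.addInputPath(job, new Path("/tmp/input"));
  Path hfileDir = new Path("/tmp/hfiles");
  FileOutputFormat.setOutputPath(job, hfileDir);

  TableName tableName = TableName.valueOf("example_table");
  try (Connection connection = ConnectionFactory.createConnection(conf);
       Table table = connection.getTable(tableName);
       RegionLocator regionLocator = connection.getRegionLocator(tableName);
       Admin admin = connection.getAdmin()) {
    // Sets up the sort reducer, partitioner, and per-family HFile settings as shown above.
    HFileOutputFormat2.configureIncrementalLoad(job, table.getTableDescriptor(), regionLocator);
    if (job.waitForCompletion(true)) {
      // Move the generated HFiles into the table's regions.
      new LoadIncrementalHFiles(conf).doBulkLoad(hfileDir, admin, table, regionLocator);
    }
  }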
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hbase by apache.
the class MultiHFileOutputFormat method createMultiHFileRecordWriter.
static <V extends Cell> RecordWriter<ImmutableBytesWritable, V> createMultiHFileRecordWriter(
    final TaskAttemptContext context) throws IOException {
  // Get the path of the output directory
  final Path outputPath = FileOutputFormat.getOutputPath(context);
  final Path outputDir = new FileOutputCommitter(outputPath, context).getWorkPath();
  final Configuration conf = context.getConfiguration();
  final FileSystem fs = outputDir.getFileSystem(conf);
  // Map of tables to writers
  final Map<ImmutableBytesWritable, RecordWriter<ImmutableBytesWritable, V>> tableWriters = new HashMap<>();
  return new RecordWriter<ImmutableBytesWritable, V>() {

    @Override
    public void write(ImmutableBytesWritable tableName, V cell) throws IOException, InterruptedException {
      RecordWriter<ImmutableBytesWritable, V> tableWriter = tableWriters.get(tableName);
      // if there is new table, verify that table directory exists
      if (tableWriter == null) {
        // using table name as directory name
        final Path tableOutputDir = new Path(outputDir, Bytes.toString(tableName.copyBytes()));
        fs.mkdirs(tableOutputDir);
        LOG.info("Writing Table '" + tableName.toString() + "' data into following directory" + tableOutputDir.toString());
        // Create writer for one specific table
        tableWriter = new HFileOutputFormat2.HFileRecordWriter<>(context, tableOutputDir);
        // Put table into map
        tableWriters.put(tableName, tableWriter);
      }
      // Write <Row, Cell> into tableWriter
      // in the original code, it does not use Row
      tableWriter.write(null, cell);
    }

    @Override
    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
      for (RecordWriter<ImmutableBytesWritable, V> writer : tableWriters.values()) {
        writer.close(c);
      }
    }
  };
}
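To show how this writer is meant to be fed, here is a hedged sketch of a mapper that emits the destination table name as the key and a KeyValue as the value, so each cell is routed to the per-table directory created above. The input line format, column family/qualifier, and the MultiTableMapper name are assumptions for illustration.

  // Illustrative mapper for MultiHFileOutputFormat: key = destination table name, value = cell.
  public class MultiTableMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

    @Override
    protected void map(LongWritable offset, Text line, Context context)
        throws IOException, InterruptedException {
      // Assumed input format: "tableName,rowKey,value" per line.
      String[] fields = line.toString().split(",");
      byte[] row = Bytes.toBytes(fields[1]);
      KeyValue kv = new KeyValue(row, Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes(fields[2]));
      // The write(...) above looks this key up in tableWriters and creates
      // an HFileRecordWriter under <outputDir>/<tableName> on first use.
      context.write(new ImmutableBytesWritable(Bytes.toBytes(fields[0])), kv);
    }
  }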
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hbase by apache.
the class HFileOutputFormat2 method writePartitions.
/**
 * Write out a {@link SequenceFile} that can be read by
 * {@link TotalOrderPartitioner} that contains the split points in startKeys.
 */
@SuppressWarnings("deprecation")
private static void writePartitions(Configuration conf, Path partitionsPath,
    List<ImmutableBytesWritable> startKeys) throws IOException {
  LOG.info("Writing partition information to " + partitionsPath);
  if (startKeys.isEmpty()) {
    throw new IllegalArgumentException("No regions passed");
  }
  // We're generating a list of split points, and we don't ever
  // have keys < the first region (which has an empty start key)
  // so we need to remove it. Otherwise we would end up with an
  // empty reducer with index 0
  TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys);
  ImmutableBytesWritable first = sorted.first();
  if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
    throw new IllegalArgumentException("First region of table should have empty start key. Instead has: "
        + Bytes.toStringBinary(first.get()));
  }
  sorted.remove(first);
  // Write the actual file
  FileSystem fs = partitionsPath.getFileSystem(conf);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath,
      ImmutableBytesWritable.class, NullWritable.class);
  try {
    for (ImmutableBytesWritable startKey : sorted) {
      writer.append(startKey, NullWritable.get());
    }
  } finally {
    writer.close();
  }
}
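The split-point file written here is consumed by TotalOrderPartitioner. Inside HFileOutputFormat2, configurePartitioner wires this up roughly as in the sketch below; the partitions path shown is a placeholder, and the real method picks a temporary location itself.

  // Sketch: point TotalOrderPartitioner at the partitions SequenceFile written above.
  Path partitionsPath = new Path("/tmp/partitions_" + UUID.randomUUID());
  writePartitions(job.getConfiguration(), partitionsPath, startKeys);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
  // With one reducer per region start key (minus the removed empty first key), each
  // reducer receives rows for exactly one region, keeping HFiles aligned with region boundaries.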
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project tdi-studio-se by Talend.
the class HBaseStore method run.
public static void run(String zookeeperHost, String zookeeperPort, String table, final String columns,
    Map<String, String> properties, TalendRDD<List<Object>> rdd, final List<Integer> keyList) throws IOException {
  // Build the HBase client configuration from the ZooKeeper quorum and any extra properties.
  Configuration conf = HBaseConfiguration.create();
  conf.set("hbase.zookeeper.quorum", zookeeperHost);
  conf.set("hbase.zookeeper.property.clientPort", zookeeperPort);
  conf.set("hbase.mapred.tablecolumns", columns);
  for (Entry<String, String> e : properties.entrySet()) {
    conf.set(e.getKey(), e.getValue());
  }
  // Turn each record into a (row key, Put) pair; the row key is the MD5 of the
  // concatenated key columns (or of the whole record if no key columns are configured).
  TalendPairRDD<ImmutableBytesWritable, Put> hbaseRdd =
      rdd.mapToPair(new PairFunction<List<Object>, ImmutableBytesWritable, Put>() {

        private static final long serialVersionUID = 1L;

        public Tuple2<ImmutableBytesWritable, Put> call(List<Object> t) throws Exception {
          String key = "";
          for (int i : keyList) {
            key = key + t.get(i);
          }
          org.apache.hadoop.hbase.client.Put put =
              new org.apache.hadoop.hbase.client.Put(DigestUtils.md5("".equals(key) ? t.toString() : key));
          // "columns" is a space-separated list of family:qualifier pairs, one per field.
          String[] cols = columns.split(" ");
          int i = 0;
          for (Object o : t) {
            if (cols.length > i) {
              put.add(org.apache.hadoop.hbase.util.Bytes.toBytes(cols[i].split(":")[0]),
                  org.apache.hadoop.hbase.util.Bytes.toBytes(cols[i].split(":")[1]),
                  (o != null ? org.apache.hadoop.hbase.util.Bytes.toBytes(o.toString()) : null));
            }
            i++;
          }
          // TableOutputFormat ignores the key, so an empty ImmutableBytesWritable is enough.
          return new Tuple2<ImmutableBytesWritable, Put>(new ImmutableBytesWritable(), put);
        }
      });
  // Write the Puts to the target table through the (mapred) TableOutputFormat.
  JobConf config = new JobConf(conf);
  config.set(TableOutputFormat.OUTPUT_TABLE, table);
  config.setOutputFormat(TableOutputFormat.class);
  hbaseRdd.saveAsHadoopDataset(config);
}
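The same pattern works outside the Talend wrappers. Below is a hedged equivalent using plain Spark's JavaPairRDD; the column family, qualifier, table name, and sample data are placeholders, and Put.addColumn is used here as the newer replacement for the deprecated put.add above.

  // Plain-Spark sketch of the same write path: an RDD of (row, value) pairs -> mapred TableOutputFormat.
  SparkConf sparkConf = new SparkConf().setAppName("hbase-store-sketch");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  JavaRDD<String[]> rows = sc.parallelize(Arrays.asList(
      new String[] { "row1", "v1" }, new String[] { "row2", "v2" }));
  JavaPairRDD<ImmutableBytesWritable, Put> hbasePairs = rows.mapToPair(r -> {
    Put put = new Put(Bytes.toBytes(r[0]));
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes(r[1]));
    // TableOutputFormat ignores the key, so an empty ImmutableBytesWritable suffices.
    return new Tuple2<>(new ImmutableBytesWritable(), put);
  });
  JobConf jobConf = new JobConf(HBaseConfiguration.create());
  jobConf.set(TableOutputFormat.OUTPUT_TABLE, "target_table");
  jobConf.setOutputFormat(TableOutputFormat.class);
  hbasePairs.saveAsHadoopDataset(jobConf);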
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project crunch by cloudera.
the class WordCountHBaseTest method run.
public void run(Pipeline pipeline) throws IOException {
  Random rand = new Random();
  int postFix = Math.abs(rand.nextInt());
  String inputTableName = "crunch_words_" + postFix;
  String outputTableName = "crunch_counts_" + postFix;
  try {
    HTable inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
    HTable outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);
    int key = 0;
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "cat");
    key = put(inputTable, key, "dog");
    Scan scan = new Scan();
    scan.addColumn(WORD_COLFAM, null);
    HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
    PTable<ImmutableBytesWritable, Result> shakespeare = pipeline.read(source);
    pipeline.write(wordCount(shakespeare), new HBaseTarget(outputTableName));
    pipeline.done();
    assertIsLong(outputTable, "cat", 2);
    assertIsLong(outputTable, "dog", 1);
  } finally {
    // not quite sure...
  }
}
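The put(...) helper, wordCount(...) transform, and assertIsLong(...) used above are not included in this excerpt. For orientation only, the seeding helper likely resembles the following sketch; this is a guess at the missing code, not the actual Crunch test source.

  // Hypothetical sketch of the put(...) helper used above; it reconstructs the intent, not the original code.
  private static int put(HTable table, int key, String word) throws IOException {
    Put p = new Put(Bytes.toBytes(Integer.toString(key)));
    // WORD_COLFAM with a null qualifier matches the Scan.addColumn(WORD_COLFAM, null) call above.
    p.add(WORD_COLFAM, null, Bytes.toBytes(word));
    table.put(p);
    return key + 1;  // the caller reuses the returned value as the next row key
  }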