use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hbase by apache.
the class HFileOutputFormat2 method writePartitions.
/**
* Write out a {@link SequenceFile} that can be read by
* {@link TotalOrderPartitioner} that contains the split points in startKeys.
*/
@SuppressWarnings("deprecation")
private static void writePartitions(Configuration conf, Path partitionsPath,
    List<ImmutableBytesWritable> startKeys) throws IOException {
  LOG.info("Writing partition information to " + partitionsPath);
  if (startKeys.isEmpty()) {
    throw new IllegalArgumentException("No regions passed");
  }
  // We're generating a list of split points, and we don't ever
  // have keys < the first region (which has an empty start key)
  // so we need to remove it. Otherwise we would end up with an
  // empty reducer with index 0
  TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys);
  ImmutableBytesWritable first = sorted.first();
  if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
    throw new IllegalArgumentException("First region of table should have empty start key. Instead has: "
        + Bytes.toStringBinary(first.get()));
  }
  sorted.remove(first);
  // Write the actual file
  FileSystem fs = partitionsPath.getFileSystem(conf);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath,
      ImmutableBytesWritable.class, NullWritable.class);
  try {
    for (ImmutableBytesWritable startKey : sorted) {
      writer.append(startKey, NullWritable.get());
    }
  } finally {
    writer.close();
  }
}
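For context, the split points written here are meant to drive a TotalOrderPartitioner. The following is a minimal sketch, not part of HFileOutputFormat2, of how such a partitions file might be wired into a job; the Job instance and partitionsPath are assumed to exist.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PartitionerWiringSketch {
  // Hypothetical helper: points TotalOrderPartitioner at the SequenceFile of
  // split points produced by writePartitions above.
  static void configure(Job job, Path partitionsPath) {
    Configuration conf = job.getConfiguration();
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Map output keys must match the key type stored in the partitions file,
    // so each reducer receives one contiguous, region-aligned key range.
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  }
}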
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project Solbase by Photobucket.
the class SolbaseInitialIndexMapper method map.
protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
  context.getCounter(Counters.TOTAL_ROWS).increment(1);
  context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
  // global id is user_media row key
  String globalId = Bytes.toString(row.get());
  Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
  byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
  if (doc == null) {
    // validation must have failed if it returned null
    return;
  }
  // exists already
  if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
    docId = SolbaseUtil.generateUniqId();
    this.idCounter = 0;
  } else {
    docId--;
  }
  // for us, docId is going to be global uniq id, meaning we are tied to 2 billion docs limitation
  // it doesn't really hurt to add this field to doc. and it only really matters when sharding comes in, trying to fetch docs by their docid
  indexerUtil.addFieldToDoc(doc, "docId", docId + "");
  // incrementing chunking sequence (lucene doc id)
  this.idCounter++;
  try {
    ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
    List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
    MapWritable mapWritable = new MapWritable();
    DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
    mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
    for (TermDocMetadata metadata : metadatas) {
      byte[] key = metadata.getFieldTermKey();
      ByteBuffer buf = metadata.serialize();
      TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
      mapWritable.put(new BytesWritable(key), writable);
    }
    context.write(new BytesWritable(checksum), mapWritable);
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
}
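The map() above appears to run over an HBase table scan, but the snippet does not show how the job is configured. Below is a hedged sketch of typical wiring with TableMapReduceUtil; the table name "user_media" and the scan settings are assumptions, and the import for SolbaseInitialIndexMapper is omitted because its package is not shown here.

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Job;

public class IndexJobSketch {
  static void wireMapper(Job job) throws Exception {
    Scan scan = new Scan();
    scan.setCaching(500);       // assumed: larger scanner cache for a full-table pass
    scan.setCacheBlocks(false); // avoid polluting the region server block cache
    TableMapReduceUtil.initTableMapperJob(
        "user_media",                     // assumed source table name
        scan,
        SolbaseInitialIndexMapper.class,  // the mapper shown above
        BytesWritable.class,              // mapper emits the checksum as its key
        MapWritable.class,                // and a MapWritable of doc/term writables
        job);
  }
}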
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project Solbase by Photobucket.
the class SolbaseIndexReducer method reduce.
public void reduce(BytesWritable key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
  byte[] _key = null;
  int counter = 0;
  int dupCount = 0;
  // since key is checksum, we should do dedupping here
  // TODO: for now, i'm only retrieving one and ignoring rest
  boolean first = true;
  for (MapWritable writable : values) {
    if (first) {
      first = false;
      Iterator<Writable> itr = writable.keySet().iterator();
      while (itr.hasNext()) {
        BytesWritable wrtKey = (BytesWritable) itr.next();
        Writable wrt = writable.get(wrtKey);
        if (wrt instanceof DocumentPutWritable) {
          DocumentPutWritable docBytes = (DocumentPutWritable) wrt;
          String globalId = docBytes.getGlobalId();
          int docId = docBytes.getDocId();
          Put mapping = new Put(Bytes.toBytes(globalId));
          mapping.add(Bytes.toBytes("docId"), Bytes.toBytes(""), Bytes.toBytes(docId));
          context.write(new ImmutableBytesWritable(SolbaseUtil.docKeyIdMapTable), mapping);
          context.getCounter(Counters.TOTAL_DOC_KEY_ID_MAP).increment(1);
          List<String> fieldKeys = docBytes.getFieldKeys();
          List<byte[]> fieldValues = docBytes.getFieldValues();
          List<Term> allTerms = docBytes.getAllTerms();
          byte[] md5DocId = SolbaseUtil.randomize(docId);
          Put documentPut = new Put(md5DocId);
          // Store each field as a column under this docId
          for (int i = 0; i < fieldKeys.size(); i++) {
            String fieldKey = fieldKeys.get(i);
            byte[] fieldValue = fieldValues.get(i);
            documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(fieldKey), fieldValue);
          }
          // Finally, store meta-data so we can delete this document
          documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allTerms).array());
          context.write(new ImmutableBytesWritable(SolbaseUtil.docTable), documentPut);
          context.getCounter(Counters.TOTAL_DOCS).increment(1);
          counter++;
        } else if (wrt instanceof TermDocMetadataWritable) {
          // gather all of docs given field key (field/value)
          TermDocMetadataWritable metadata = (TermDocMetadataWritable) wrt;
          // convert key to byte array
          // byte[] fieldTermKey = key.getBytes();
          byte[] termValue = metadata.getTermDocMetadata();
          _key = metadata.getFieldTermKey();
          int docId = metadata.getDocId();
          Put put = null;
          switch (TermDocMetadataLoader.storageType) {
            case KEY_ONLY: {
              put = new Put(Bytes.add(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)), termValue));
              put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(""));
              break;
            }
            case WIDE_ROW:
              int chunkId = TermDocMetadataLoader.getChunkId(docId);
              put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
              put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(docId), termValue);
              break;
            case NARROW_ROW:
            default: {
              put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)));
              put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), termValue);
            }
          }
          context.write(new ImmutableBytesWritable(SolbaseUtil.termVectorTable), put);
          context.getCounter(Counters.TOTAL_TERM_VECTORS).increment(1);
          counter++;
        } else {
          System.out.println("else: " + writable.getClass());
          context.getCounter(Counters.TOTAL_INVALID).increment(1);
        }
      }
    } else {
      dupCount++;
    }
  }
  context.getCounter(Counters.DUPLICATE_ROWS).increment(dupCount);
}
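This reducer writes (table name, Put) pairs, which matches the contract of HBase's MultiTableOutputFormat, where the ImmutableBytesWritable key names the destination table. The wiring below is a sketch under that assumption, not code taken from Solbase; the import for SolbaseIndexReducer is omitted because its package is not shown.

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Job;

public class ReducerWiringSketch {
  static void wireReducer(Job job) {
    job.setReducerClass(SolbaseIndexReducer.class);
    // Keys emitted by the reducer name the destination table
    // (docKeyIdMapTable, docTable, termVectorTable); values are Puts.
    job.setOutputFormatClass(MultiTableOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Put.class);
    // Reducer input: checksum -> MapWritable, as produced by the mapper.
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(MapWritable.class);
  }
}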
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project SpyGlass by ParallelAI.
the class HBaseScheme method sink.
@Override
public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
  TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
  OutputCollector outputCollector = sinkCall.getOutput();
  Tuple key = tupleEntry.selectTuple(keyField);
  ImmutableBytesWritable keyBytes = (ImmutableBytesWritable) key.getObject(0);
  if (useSalt) {
    keyBytes = HBaseSalter.addSaltPrefix(keyBytes);
  }
  Put put;
  if (this.timeStamp == 0L) {
    put = new Put(keyBytes.get());
  } else {
    put = new Put(keyBytes.get(), this.timeStamp);
  }
  for (int i = 0; i < valueFields.length; i++) {
    Fields fieldSelector = valueFields[i];
    TupleEntry values = tupleEntry.selectEntry(fieldSelector);
    for (int j = 0; j < values.getFields().size(); j++) {
      Fields fields = values.getFields();
      Tuple tuple = values.getTuple();
      ImmutableBytesWritable valueBytes = (ImmutableBytesWritable) tuple.getObject(j);
      if (valueBytes != null) {
        put.add(Bytes.toBytes(familyNames[i]), Bytes.toBytes((String) fields.get(j)), valueBytes.get());
      }
    }
  }
  outputCollector.collect(null, put);
}
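sink() assumes the key field and every value field in the outgoing TupleEntry already hold ImmutableBytesWritable objects. A small illustrative sketch of building such a tuple follows; the field contents are made up for the example.

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import cascading.tuple.Tuple;

public class TupleSketch {
  // Hypothetical helper: both the row key and the column value are wrapped
  // as ImmutableBytesWritable so sink() can call .get() to reach the raw bytes.
  static Tuple rowFor(String rowKey, String columnValue) {
    return new Tuple(
        new ImmutableBytesWritable(Bytes.toBytes(rowKey)),
        new ImmutableBytesWritable(Bytes.toBytes(columnValue)));
  }
}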
use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hive by apache.
the class HiveHBaseTableInputFormat method getRecordReader.
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException {
  HBaseSplit hbaseSplit = (HBaseSplit) split;
  TableSplit tableSplit = hbaseSplit.getTableSplit();
  if (conn == null) {
    conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
  }
  initializeTable(conn, tableSplit.getTable());
  setScan(HiveHBaseInputFormatUtil.getScan(jobConf));
  Job job = new Job(jobConf);
  TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);
  final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(tableSplit, tac);
  try {
    recordReader.initialize(tableSplit, tac);
  } catch (InterruptedException e) {
    // Free up the HTable connections
    closeTable();
    if (conn != null) {
      conn.close();
      conn = null;
    }
    throw new IOException("Failed to initialize RecordReader", e);
  }
  return new RecordReader<ImmutableBytesWritable, ResultWritable>() {
    @Override
    public void close() throws IOException {
      recordReader.close();
      closeTable();
      if (conn != null) {
        conn.close();
        conn = null;
      }
    }

    @Override
    public ImmutableBytesWritable createKey() {
      return new ImmutableBytesWritable();
    }

    @Override
    public ResultWritable createValue() {
      return new ResultWritable(new Result());
    }

    @Override
    public long getPos() throws IOException {
      return 0;
    }

    @Override
    public float getProgress() throws IOException {
      float progress = 0.0F;
      try {
        progress = recordReader.getProgress();
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return progress;
    }

    @Override
    public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
      boolean next = false;
      try {
        next = recordReader.nextKeyValue();
        if (next) {
          rowKey.set(recordReader.getCurrentValue().getRow());
          value.setResult(recordReader.getCurrentValue());
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return next;
    }
  };
}
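A minimal consumption sketch for the wrapped mapred-style reader returned above; the driver context and variable names are placeholders, not Hive code.

import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.hbase.ResultWritable;
import org.apache.hadoop.mapred.RecordReader;

public class ReaderLoopSketch {
  static void drain(RecordReader<ImmutableBytesWritable, ResultWritable> reader) throws IOException {
    ImmutableBytesWritable rowKey = reader.createKey();
    ResultWritable value = reader.createValue();
    // next() copies the current row key into rowKey and the current Result into value.
    while (reader.next(rowKey, value)) {
      System.out.println("row: " + Bytes.toStringBinary(rowKey.get()));
    }
    reader.close();
  }
}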