
Example 21 with ImmutableBytesWritable

Use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hbase by apache.

The class HFileOutputFormat2, method writePartitions.

/**
   * Write out a {@link SequenceFile} that can be read by
   * {@link TotalOrderPartitioner} that contains the split points in startKeys.
   */
@SuppressWarnings("deprecation")
private static void writePartitions(Configuration conf, Path partitionsPath, List<ImmutableBytesWritable> startKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (startKeys.isEmpty()) {
        throw new IllegalArgumentException("No regions passed");
    }
    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<ImmutableBytesWritable> sorted = new TreeSet<>(startKeys);
    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
        throw new IllegalArgumentException("First region of table should have empty start key. Instead has: " + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);
    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath, ImmutableBytesWritable.class, NullWritable.class);
    try {
        for (ImmutableBytesWritable startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}
Also used: ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) TreeSet(java.util.TreeSet) FileSystem(org.apache.hadoop.fs.FileSystem) HFileSystem(org.apache.hadoop.hbase.fs.HFileSystem)
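
The partitions file written above is only useful once a job is told to use it. Below is a minimal sketch, not taken from the HBase source, of how such a file is typically handed to TotalOrderPartitioner; the class and method names in the sketch are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PartitionerSetupSketch {
    // Point the job at a SequenceFile of split points, such as the one produced by
    // writePartitions above, so that reduce tasks line up with region boundaries.
    static void usePartitionsFile(Job job, Path partitionsPath) {
        Configuration conf = job.getConfiguration();
        TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
}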

Example 22 with ImmutableBytesWritable

Use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project Solbase by Photobucket.

The class SolbaseInitialIndexMapper, method map.

protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
    context.getCounter(Counters.TOTAL_ROWS).increment(1);
    context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
    // global id is user_media row key
    String globalId = Bytes.toString(row.get());
    Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
    byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
    if (doc == null) {
        // validation must have failed if it returned null
        return;
    }
    // exists already
    if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
        docId = SolbaseUtil.generateUniqId();
        this.idCounter = 0;
    } else {
        docId--;
    }
    // for us, docId is going to be global uniq id, meaning we are tied to 2 billion docs limitation
    // it doesn't really hurt to add this field to doc. and it only really matters when sharding comes in, trying to fetch docs by their docid
    indexerUtil.addFieldToDoc(doc, "docId", docId + "");
    // incrementing chunking sequence (lucene doc id)
    this.idCounter++;
    try {
        ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
        List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
        MapWritable mapWritable = new MapWritable();
        DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
        mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
        for (TermDocMetadata metadata : metadatas) {
            byte[] key = metadata.getFieldTermKey();
            ByteBuffer buf = metadata.serialize();
            TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
            mapWritable.put(new BytesWritable(key), writable);
        }
        context.write(new BytesWritable(checksum), mapWritable);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used: ParsedDoc(org.solbase.indexer.ParsedDoc) TermDocMetadataWritable(org.solbase.indexer.writable.TermDocMetadataWritable) TermDocMetadata(org.solbase.lucenehbase.TermDocMetadata) BytesWritable(org.apache.hadoop.io.BytesWritable) ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) MapWritable(org.apache.hadoop.io.MapWritable) Document(org.apache.lucene.document.Document) ByteBuffer(java.nio.ByteBuffer) DocumentPutWritable(org.solbase.indexer.writable.DocumentPutWritable)
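
The mapper above hands out Lucene document ids in reserved blocks of SolbaseUtil.UNIQ_ID_CHUNK at a time and counts down inside each block. A minimal, self-contained sketch of that allocation pattern follows; it illustrates the idea and is not Solbase code, and the supplier argument stands in for a shared counter such as SolbaseUtil.generateUniqId().

import java.util.function.IntSupplier;

public class ChunkedIdAllocatorSketch {
    private final int chunkSize;
    private Integer currentId;   // last id handed out, or null before the first chunk is reserved
    private int usedInChunk;

    public ChunkedIdAllocatorSketch(int chunkSize) {
        this.chunkSize = chunkSize;
    }

    // Returns the next id, reserving a fresh block from the shared counter when the
    // current block is exhausted, and otherwise counting down within the block.
    public int next(IntSupplier reserveChunkUpperBound) {
        if (currentId == null || usedInChunk >= chunkSize) {
            currentId = reserveChunkUpperBound.getAsInt();
            usedInChunk = 0;
        } else {
            currentId--;
        }
        usedInChunk++;
        return currentId;
    }
}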

Example 23 with ImmutableBytesWritable

Use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project Solbase by Photobucket.

The class SolbaseIndexReducer, method reduce.

public void reduce(BytesWritable key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
    byte[] _key = null;
    int counter = 0;
    int dupCount = 0;
    // since key is checksum, we should do dedupping here
    // TODO: for now, i'm only retrieving one and ignoring rest
    boolean first = true;
    for (MapWritable writable : values) {
        if (first) {
            first = false;
            Iterator<Writable> itr = writable.keySet().iterator();
            while (itr.hasNext()) {
                BytesWritable wrtKey = (BytesWritable) itr.next();
                Writable wrt = writable.get(wrtKey);
                if (wrt instanceof DocumentPutWritable) {
                    DocumentPutWritable docBytes = (DocumentPutWritable) wrt;
                    String globalId = docBytes.getGlobalId();
                    int docId = docBytes.getDocId();
                    Put mapping = new Put(Bytes.toBytes(globalId));
                    mapping.add(Bytes.toBytes("docId"), Bytes.toBytes(""), Bytes.toBytes(docId));
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docKeyIdMapTable), mapping);
                    context.getCounter(Counters.TOTAL_DOC_KEY_ID_MAP).increment(1);
                    List<String> fieldKeys = docBytes.getFieldKeys();
                    List<byte[]> fieldValues = docBytes.getFieldValues();
                    List<Term> allTerms = docBytes.getAllTerms();
                    byte[] md5DocId = SolbaseUtil.randomize(docId);
                    Put documentPut = new Put(md5DocId);
                    // Store each field as a column under this docId
                    for (int i = 0; i < fieldKeys.size(); i++) {
                        String fieldKey = fieldKeys.get(i);
                        byte[] fieldValue = fieldValues.get(i);
                        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(fieldKey), fieldValue);
                    }
                    // Finally, Store meta-data so we can delete this
                    // document
                    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allTerms).array());
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docTable), documentPut);
                    context.getCounter(Counters.TOTAL_DOCS).increment(1);
                    counter++;
                } else if (wrt instanceof TermDocMetadataWritable) {
                    // gather all of docs given field key (field/value)
                    TermDocMetadataWritable metadata = (TermDocMetadataWritable) wrt;
                    // convert key to byte array
                    // byte[] fieldTermKey = key.getBytes();
                    byte[] termValue = metadata.getTermDocMetadata();
                    _key = metadata.getFieldTermKey();
                    int docId = metadata.getDocId();
                    Put put = null;
                    switch(TermDocMetadataLoader.storageType) {
                        case KEY_ONLY:
                            {
                                put = new Put(Bytes.add(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)), termValue));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(""));
                            }
                            break;
                        case WIDE_ROW:
                            int chunkId = TermDocMetadataLoader.getChunkId(docId);
                            put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
                            put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(docId), termValue);
                            break;
                        case NARROW_ROW:
                        default:
                            {
                                put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), termValue);
                            }
                    }
                    context.write(new ImmutableBytesWritable(SolbaseUtil.termVectorTable), put);
                    context.getCounter(Counters.TOTAL_TERM_VECTORS).increment(1);
                    counter++;
                } else {
                    System.out.println("else: " + writable.getClass());
                    context.getCounter(Counters.TOTAL_INVALID).increment(1);
                }
            }
        } else {
            dupCount++;
        }
    }
    context.getCounter(Counters.DUPLICATE_ROWS).increment(dupCount);
}
Also used: ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) Writable(org.apache.hadoop.io.Writable) DocumentPutWritable(org.solbase.indexer.writable.DocumentPutWritable) MapWritable(org.apache.hadoop.io.MapWritable) TermDocMetadataWritable(org.solbase.indexer.writable.TermDocMetadataWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) Term(org.apache.lucene.index.Term) Put(org.apache.hadoop.hbase.client.Put)
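
Note how the reducer keys each Put with an ImmutableBytesWritable holding a table name (docKeyIdMapTable, docTable, termVectorTable) rather than a row key. That pairing matches the contract of HBase's MultiTableOutputFormat, which routes each mutation to the table named by its key. The driver wiring below is a sketch under that assumption; Solbase's actual job setup is not shown on this page.

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.MultiTableOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;

public class MultiTableSinkSketch {
    // Configure a job so that (table name, Put) pairs emitted by a reducer like the
    // one above are written to the named HBase tables.
    static void configureOutput(Job job, Class<? extends Reducer<?, ?, ?, ?>> reducerClass) {
        job.setReducerClass(reducerClass);
        job.setOutputFormatClass(MultiTableOutputFormat.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(Put.class);
    }
}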

Example 24 with ImmutableBytesWritable

Use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project SpyGlass by ParallelAI.

The class HBaseScheme, method sink.

@Override
public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
    OutputCollector outputCollector = sinkCall.getOutput();
    Tuple key = tupleEntry.selectTuple(keyField);
    ImmutableBytesWritable keyBytes = (ImmutableBytesWritable) key.getObject(0);
    if (useSalt) {
        keyBytes = HBaseSalter.addSaltPrefix(keyBytes);
    }
    Put put;
    if (this.timeStamp == 0L) {
        put = new Put(keyBytes.get());
    } else {
        put = new Put(keyBytes.get(), this.timeStamp);
    }
    for (int i = 0; i < valueFields.length; i++) {
        Fields fieldSelector = valueFields[i];
        TupleEntry values = tupleEntry.selectEntry(fieldSelector);
        for (int j = 0; j < values.getFields().size(); j++) {
            Fields fields = values.getFields();
            Tuple tuple = values.getTuple();
            ImmutableBytesWritable valueBytes = (ImmutableBytesWritable) tuple.getObject(j);
            if (valueBytes != null)
                put.add(Bytes.toBytes(familyNames[i]), Bytes.toBytes((String) fields.get(j)), valueBytes.get());
        }
    }
    outputCollector.collect(null, put);
}
Also used: OutputCollector(org.apache.hadoop.mapred.OutputCollector) ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) Fields(cascading.tuple.Fields) TupleEntry(cascading.tuple.TupleEntry) Tuple(cascading.tuple.Tuple) Put(org.apache.hadoop.hbase.client.Put)
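
The sink expects every tuple entry, key and values alike, to already be an ImmutableBytesWritable. A small sketch of preparing such a tuple is shown below; the helper and its field layout are illustrative assumptions, not part of SpyGlass.

import cascading.tuple.Tuple;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseSinkTupleSketch {
    // Wraps a row key and its column values as ImmutableBytesWritable entries, the
    // form that sink() above reads back with getObject() before building the Put.
    static Tuple rowTuple(String rowKey, String... columnValues) {
        Tuple tuple = new Tuple();
        tuple.add(new ImmutableBytesWritable(Bytes.toBytes(rowKey)));
        for (String value : columnValues) {
            tuple.add(new ImmutableBytesWritable(Bytes.toBytes(value)));
        }
        return tuple;
    }
}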

Example 25 with ImmutableBytesWritable

Use of org.apache.hadoop.hbase.io.ImmutableBytesWritable in project hive by apache.

The class HiveHBaseTableInputFormat, method getRecordReader.

@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getTableSplit();
    if (conn == null) {
        conn = ConnectionFactory.createConnection(HBaseConfiguration.create(jobConf));
    }
    initializeTable(conn, tableSplit.getTable());
    setScan(HiveHBaseInputFormatUtil.getScan(jobConf));
    Job job = new Job(jobConf);
    TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);
    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(tableSplit, tac);
    try {
        recordReader.initialize(tableSplit, tac);
    } catch (InterruptedException e) {
        // Free up the HTable connections
        closeTable();
        if (conn != null) {
            conn.close();
            conn = null;
        }
        throw new IOException("Failed to initialize RecordReader", e);
    }
    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public void close() throws IOException {
            recordReader.close();
            closeTable();
            if (conn != null) {
                conn.close();
                conn = null;
            }
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(new Result());
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        @Override
        public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    value.setResult(recordReader.getCurrentValue());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
Also used: ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) RecordReader(org.apache.hadoop.mapred.RecordReader) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) IOException(java.io.IOException) Result(org.apache.hadoop.hbase.client.Result) TableSplit(org.apache.hadoop.hbase.mapreduce.TableSplit) Job(org.apache.hadoop.mapreduce.Job)
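
For completeness, a typical consumer of the old-API reader returned above looks like the sketch below; it is generic read-loop code, not part of Hive, and it assumes the caller owns the reader's lifecycle.

import java.io.IOException;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hive.hbase.ResultWritable;
import org.apache.hadoop.mapred.RecordReader;

public class RecordReaderLoopSketch {
    // Reuse the key/value objects the reader creates and iterate until next() returns false.
    static long countRows(RecordReader<ImmutableBytesWritable, ResultWritable> reader) throws IOException {
        ImmutableBytesWritable rowKey = reader.createKey();
        ResultWritable value = reader.createValue();
        long rows = 0;
        try {
            while (reader.next(rowKey, value)) {
                // rowKey now holds the HBase row key; value.getResult() holds the row's cells.
                rows++;
            }
        } finally {
            reader.close();
        }
        return rows;
    }
}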

Aggregations

ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable) 264
Test (org.junit.Test) 80
Expression (org.apache.phoenix.expression.Expression) 32
IOException (java.io.IOException) 26
PSmallint (org.apache.phoenix.schema.types.PSmallint) 25
Result (org.apache.hadoop.hbase.client.Result) 24
PTable (org.apache.phoenix.schema.PTable) 24
ArrayList (java.util.ArrayList) 23
Cell (org.apache.hadoop.hbase.Cell) 23
KeyValue (org.apache.hadoop.hbase.KeyValue) 23
LiteralExpression (org.apache.phoenix.expression.LiteralExpression) 23
PTinyint (org.apache.phoenix.schema.types.PTinyint) 23
PhoenixArray (org.apache.phoenix.schema.types.PhoenixArray) 23
Configuration (org.apache.hadoop.conf.Configuration) 20
PDataType (org.apache.phoenix.schema.types.PDataType) 20
PUnsignedSmallint (org.apache.phoenix.schema.types.PUnsignedSmallint) 20
PUnsignedTinyint (org.apache.phoenix.schema.types.PUnsignedTinyint) 20
List (java.util.List) 19
Put (org.apache.hadoop.hbase.client.Put) 19
SQLException (java.sql.SQLException) 18