
Example 71 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache: class SequenceFileAsBinaryOutputFormat, method getRecordWriter.

@Override
public RecordWriter<BytesWritable, BytesWritable> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
    // get the path of the temporary output file 
    Path file = FileOutputFormat.getTaskOutputPath(job, name);
    FileSystem fs = file.getFileSystem(job);
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    if (getCompressOutput(job)) {
        // find the kind of compression to do
        compressionType = getOutputCompressionType(job);
        // find the right codec
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, DefaultCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, job);
    }
    final SequenceFile.Writer out = SequenceFile.createWriter(fs, job, file, getSequenceFileOutputKeyClass(job), getSequenceFileOutputValueClass(job), compressionType, codec, progress);
    return new RecordWriter<BytesWritable, BytesWritable>() {

        private WritableValueBytes wvaluebytes = new WritableValueBytes();

        public void write(BytesWritable bkey, BytesWritable bvalue) throws IOException {
            wvaluebytes.reset(bvalue);
            out.appendRaw(bkey.getBytes(), 0, bkey.getLength(), wvaluebytes);
            wvaluebytes.reset(null);
        }

        public void close(Reporter reporter) throws IOException {
            out.close();
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), BytesWritable (org.apache.hadoop.io.BytesWritable), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), CompressionType (org.apache.hadoop.io.SequenceFile.CompressionType).
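The output produced by this format is an ordinary SequenceFile, so it can be read back with SequenceFile.Reader using whatever key and value classes were configured through setSequenceFileOutputKeyClass/setSequenceFileOutputValueClass. Below is a minimal sketch, assuming BytesWritable was configured for both; the class name and the path argument are only illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;

public class ReadBinarySeqFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical location of a task output file written by the format above.
        Path file = new Path(args[0]);
        SequenceFile.Reader in = new SequenceFile.Reader(fs, file, conf);
        try {
            BytesWritable key = new BytesWritable();
            BytesWritable value = new BytesWritable();
            // next() deserializes each record into the reusable key/value instances.
            while (in.next(key, value)) {
                System.out.println(key.getLength() + " key bytes, "
                    + value.getLength() + " value bytes");
            }
        } finally {
            in.close();
        }
    }
}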

Example 72 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache: class TestTFileSeek, method createTFile.

private void createTFile() throws IOException {
    long totalBytes = 0;
    FSDataOutputStream fout = createFSOutput(path, fs);
    try {
        Writer writer = new Writer(fout, options.minBlockSize, options.compress, "memcmp", conf);
        try {
            BytesWritable key = new BytesWritable();
            BytesWritable val = new BytesWritable();
            timer.start();
            for (long i = 0; true; ++i) {
                if (i % 1000 == 0) {
                    // test the size for every 1000 rows.
                    if (fs.getFileStatus(path).getLen() >= options.fileSize) {
                        break;
                    }
                }
                kvGen.next(key, val, false);
                writer.append(key.getBytes(), 0, key.getLength(), val.getBytes(), 0, val.getLength());
                totalBytes += key.getLength();
                totalBytes += val.getLength();
            }
            timer.stop();
        } finally {
            writer.close();
        }
    } finally {
        fout.close();
    }
    // duration in microseconds (timer.read() returns nanoseconds).
    double duration = (double) timer.read() / 1000;
    long fsize = fs.getFileStatus(path).getLen();
    System.out.printf("time: %s...uncompressed: %.2fMB...raw thrpt: %.2fMB/s\n", timer.toString(), (double) totalBytes / 1024 / 1024, totalBytes / duration);
    System.out.printf("time: %s...file size: %.2fMB...disk thrpt: %.2fMB/s\n", timer.toString(), (double) fsize / 1024 / 1024, fsize / duration);
}
Also used: BytesWritable (org.apache.hadoop.io.BytesWritable), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), Writer (org.apache.hadoop.io.file.tfile.TFile.Writer).
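A file written this way can be scanned back sequentially with TFile.Reader, using the same Scanner API that seekTFile below relies on. A minimal sketch, with a hypothetical class name and the TFile path taken from the command line:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.file.tfile.TFile.Reader;
import org.apache.hadoop.io.file.tfile.TFile.Reader.Scanner;

public class ScanTFile {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical location of a TFile written as in createTFile above.
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream fsdis = fs.open(path);
        Reader reader = new Reader(fsdis, fs.getFileStatus(path).getLen(), conf);
        Scanner scanner = reader.createScanner();
        BytesWritable key = new BytesWritable();
        BytesWritable val = new BytesWritable();
        long rows = 0;
        while (!scanner.atEnd()) {
            // entry() copies the current record into the reusable writables.
            scanner.entry().get(key, val);
            rows++;
            scanner.advance();
        }
        scanner.close();
        reader.close();
        fsdis.close();
        System.out.println("scanned " + rows + " rows");
    }
}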

Example 73 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache: class TestTFileSeek, method seekTFile.

public void seekTFile() throws IOException {
    int miss = 0;
    long totalBytes = 0;
    FSDataInputStream fsdis = fs.open(path);
    Reader reader = new Reader(fsdis, fs.getFileStatus(path).getLen(), conf);
    KeySampler kSampler = new KeySampler(rng, reader.getFirstKey(), reader.getLastKey(), keyLenGen);
    Scanner scanner = reader.createScanner();
    BytesWritable key = new BytesWritable();
    BytesWritable val = new BytesWritable();
    timer.reset();
    timer.start();
    for (int i = 0; i < options.seekCount; ++i) {
        kSampler.next(key);
        scanner.lowerBound(key.getBytes(), 0, key.getLength());
        if (!scanner.atEnd()) {
            scanner.entry().get(key, val);
            totalBytes += key.getLength();
            totalBytes += val.getLength();
        } else {
            ++miss;
        }
    }
    timer.stop();
    // duration in microseconds (timer.read() returns nanoseconds).
    double duration = (double) timer.read() / 1000;
    System.out.printf("time: %s...avg seek: %s...%d hit...%d miss...avg I/O size: %.2fKB\n", timer.toString(), NanoTimer.nanoTimeToString(timer.read() / options.seekCount), options.seekCount - miss, miss, (double) totalBytes / 1024 / (options.seekCount - miss));
}
Also used: Scanner (org.apache.hadoop.io.file.tfile.TFile.Reader.Scanner), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), Reader (org.apache.hadoop.io.file.tfile.TFile.Reader), BytesWritable (org.apache.hadoop.io.BytesWritable).
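One detail both TFile examples depend on: a BytesWritable reuses a backing array that may be larger than its logical content, so only the first getLength() bytes of getBytes() are valid. A small, self-contained sketch of the safe ways to extract the payload (the class name is only illustrative):

import java.util.Arrays;
import org.apache.hadoop.io.BytesWritable;

public class BytesWritableLength {
    public static void main(String[] args) {
        BytesWritable bw = new BytesWritable();
        bw.set(new byte[] { 1, 2, 3, 4, 5 }, 0, 5);
        // Shrink the logical size; the backing array keeps its capacity.
        bw.setSize(3);
        byte[] backing = bw.getBytes();                       // may be longer than 3
        byte[] exact1 = bw.copyBytes();                       // exactly getLength() bytes
        byte[] exact2 = Arrays.copyOf(backing, bw.getLength());
        System.out.println(backing.length + " backing bytes, "
            + exact1.length + " logical bytes, equal: "
            + Arrays.equals(exact1, exact2));
    }
}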

Example 74 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project Solbase by Photobucket: class SolbaseInitialIndexMapper, method map.

protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
    context.getCounter(Counters.TOTAL_ROWS).increment(1);
    context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
    // global id is user_media row key
    String globalId = Bytes.toString(row.get());
    Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
    byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
    if (doc == null) {
        // validation must have failed if it returned null
        return;
    }
    // exists already
    if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
        docId = SolbaseUtil.generateUniqId();
        this.idCounter = 0;
    } else {
        docId--;
    }
    // for us, docId is going to be global uniq id, meaning we are tied to 2 billion docs limitation
    // it doesn't really hurt to add this field to doc. and it only really matters when sharding comes in, trying to fetch docs by their docid
    indexerUtil.addFieldToDoc(doc, "docId", docId + "");
    // incrementing chunking sequence (lucene doc id)
    this.idCounter++;
    try {
        ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
        List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
        MapWritable mapWritable = new MapWritable();
        DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
        mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
        for (TermDocMetadata metadata : metadatas) {
            byte[] key = metadata.getFieldTermKey();
            ByteBuffer buf = metadata.serialize();
            TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
            mapWritable.put(new BytesWritable(key), writable);
        }
        context.write(new BytesWritable(checksum), mapWritable);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), TermDocMetadata (org.solbase.lucenehbase.TermDocMetadata), BytesWritable (org.apache.hadoop.io.BytesWritable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), MapWritable (org.apache.hadoop.io.MapWritable), Document (org.apache.lucene.document.Document), ByteBuffer (java.nio.ByteBuffer), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable).
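The mapper above relies on BytesWritable working as a MapWritable key, which holds because BytesWritable implements equals() and hashCode() over its valid bytes. A minimal, self-contained sketch of that pattern (class name and byte values are only illustrative):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;

public class MapWritableByBytes {
    public static void main(String[] args) {
        MapWritable map = new MapWritable();
        byte[] rawKey = new byte[] { 0x01, 0x02 };
        map.put(new BytesWritable(rawKey), new IntWritable(42));
        // A fresh BytesWritable wrapping equal bytes finds the same entry.
        Writable value = map.get(new BytesWritable(new byte[] { 0x01, 0x02 }));
        System.out.println(value);  // prints 42
    }
}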

Example 75 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project Solbase by Photobucket: class SolbaseIndexReducer, method reduce.

public void reduce(BytesWritable key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
    byte[] _key = null;
    int counter = 0;
    int dupCount = 0;
    // since key is checksum, we should do dedupping here
    // TODO: for now, i'm only retrieving one and ignoring rest
    boolean first = true;
    for (MapWritable writable : values) {
        if (first) {
            first = false;
            Iterator<Writable> itr = writable.keySet().iterator();
            while (itr.hasNext()) {
                BytesWritable wrtKey = (BytesWritable) itr.next();
                Writable wrt = writable.get(wrtKey);
                if (wrt instanceof DocumentPutWritable) {
                    DocumentPutWritable docBytes = (DocumentPutWritable) wrt;
                    String globalId = docBytes.getGlobalId();
                    int docId = docBytes.getDocId();
                    Put mapping = new Put(Bytes.toBytes(globalId));
                    mapping.add(Bytes.toBytes("docId"), Bytes.toBytes(""), Bytes.toBytes(docId));
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docKeyIdMapTable), mapping);
                    context.getCounter(Counters.TOTAL_DOC_KEY_ID_MAP).increment(1);
                    List<String> fieldKeys = docBytes.getFieldKeys();
                    List<byte[]> fieldValues = docBytes.getFieldValues();
                    List<Term> allTerms = docBytes.getAllTerms();
                    byte[] md5DocId = SolbaseUtil.randomize(docId);
                    Put documentPut = new Put(md5DocId);
                    // Store each field as a column under this docId
                    for (int i = 0; i < fieldKeys.size(); i++) {
                        String fieldKey = fieldKeys.get(i);
                        byte[] fieldValue = fieldValues.get(i);
                        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(fieldKey), fieldValue);
                    }
                    // Finally, Store meta-data so we can delete this
                    // document
                    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allTerms).array());
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docTable), documentPut);
                    context.getCounter(Counters.TOTAL_DOCS).increment(1);
                    counter++;
                } else if (wrt instanceof TermDocMetadataWritable) {
                    // gather all of docs given field key (field/value)
                    TermDocMetadataWritable metadata = (TermDocMetadataWritable) wrt;
                    // convert key to byte array
                    // byte[] fieldTermKey = key.getBytes();
                    byte[] termValue = metadata.getTermDocMetadata();
                    _key = metadata.getFieldTermKey();
                    int docId = metadata.getDocId();
                    Put put = null;
                    switch(TermDocMetadataLoader.storageType) {
                        case KEY_ONLY:
                            {
                                put = new Put(Bytes.add(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)), termValue));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(""));
                            }
                            break;
                        case WIDE_ROW:
                            int chunkId = TermDocMetadataLoader.getChunkId(docId);
                            put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
                            put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(docId), termValue);
                            break;
                        case NARROW_ROW:
                        default:
                            {
                                put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), termValue);
                            }
                    }
                    context.write(new ImmutableBytesWritable(SolbaseUtil.termVectorTable), put);
                    context.getCounter(Counters.TOTAL_TERM_VECTORS).increment(1);
                    counter++;
                } else {
                    System.out.println("else: " + writable.getClass());
                    context.getCounter(Counters.TOTAL_INVALID).increment(1);
                }
            }
        } else {
            dupCount++;
        }
    }
    context.getCounter(Counters.DUPLICATE_ROWS).increment(dupCount);
}
Also used: ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), Writable (org.apache.hadoop.io.Writable), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable), MapWritable (org.apache.hadoop.io.MapWritable), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), BytesWritable (org.apache.hadoop.io.BytesWritable), Term (org.apache.lucene.index.Term), Put (org.apache.hadoop.hbase.client.Put).
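The reducer above uses the checksum key to deduplicate: only the first MapWritable per key is processed, and the rest are counted. Stripped of the Solbase- and HBase-specific handling, the skeleton of that pattern looks roughly like the sketch below (class, counter, and output types are hypothetical, chosen just to make the sketch compile):

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class DedupByChecksumReducer
        extends Reducer<BytesWritable, MapWritable, BytesWritable, MapWritable> {

    enum Counters { DUPLICATE_ROWS }

    @Override
    protected void reduce(BytesWritable key, Iterable<MapWritable> values, Context context)
            throws IOException, InterruptedException {
        boolean first = true;
        int dupCount = 0;
        for (MapWritable value : values) {
            if (first) {
                first = false;
                // Only the first record per checksum key is kept and emitted.
                context.write(key, value);
            } else {
                dupCount++;
            }
        }
        context.getCounter(Counters.DUPLICATE_ROWS).increment(dupCount);
    }
}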

Aggregations

Other classes used alongside BytesWritable in the indexed examples, with occurrence counts:

BytesWritable (org.apache.hadoop.io.BytesWritable): 275
Text (org.apache.hadoop.io.Text): 73
LongWritable (org.apache.hadoop.io.LongWritable): 59
Test (org.junit.Test): 53
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 46
IntWritable (org.apache.hadoop.io.IntWritable): 44
ArrayList (java.util.ArrayList): 39
Path (org.apache.hadoop.fs.Path): 38
IOException (java.io.IOException): 36
Configuration (org.apache.hadoop.conf.Configuration): 33
FloatWritable (org.apache.hadoop.io.FloatWritable): 33
Writable (org.apache.hadoop.io.Writable): 32
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 31
List (java.util.List): 30
SequenceFile (org.apache.hadoop.io.SequenceFile): 27
Random (java.util.Random): 24
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 24
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 23
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 22
FileSystem (org.apache.hadoop.fs.FileSystem): 21