Example 1 with MapWritable

Use of org.apache.hadoop.io.MapWritable in project hadoop by apache.

From the class TypedBytesWritableInput, the method readMap:

public MapWritable readMap(MapWritable mw) throws IOException {
    if (mw == null) {
        mw = new MapWritable();
    }
    int length = in.readMapHeader();
    for (int i = 0; i < length; i++) {
        Writable key = read();
        Writable value = read();
        mw.put(key, value);
    }
    return mw;
}
Also used: SortedMapWritable (org.apache.hadoop.io.SortedMapWritable), VLongWritable (org.apache.hadoop.io.VLongWritable), Writable (org.apache.hadoop.io.Writable), MapWritable (org.apache.hadoop.io.MapWritable), DoubleWritable (org.apache.hadoop.io.DoubleWritable), LongWritable (org.apache.hadoop.io.LongWritable), BooleanWritable (org.apache.hadoop.io.BooleanWritable), ByteWritable (org.apache.hadoop.io.ByteWritable), BytesWritable (org.apache.hadoop.io.BytesWritable), VIntWritable (org.apache.hadoop.io.VIntWritable), ArrayWritable (org.apache.hadoop.io.ArrayWritable), FloatWritable (org.apache.hadoop.io.FloatWritable), IntWritable (org.apache.hadoop.io.IntWritable)
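
MapWritable implements java.util.Map<Writable, Writable>, so any Writable types can serve as keys and values. Before the remaining examples, here is a minimal standalone sketch (not taken from the Hadoop sources above; class and variable names are illustrative) that builds a map, looks a value up by an equal key, and iterates its entries.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import java.util.Map;

public class MapWritableSketch {
    public static void main(String[] args) {
        // MapWritable behaves like a java.util.Map whose keys and values are Writables.
        MapWritable mw = new MapWritable();
        mw.put(new Text("count"), new IntWritable(42));
        mw.put(new Text("name"), new Text("example"));

        // Look up a value by an equal (not identical) key instance.
        IntWritable count = (IntWritable) mw.get(new Text("count"));
        System.out.println("count = " + count.get());

        // Iterate entries; values come back as Writable and must be cast to their concrete types.
        for (Map.Entry<Writable, Writable> entry : mw.entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }
}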

Example 2 with MapWritable

Use of org.apache.hadoop.io.MapWritable in project accumulo by apache.

From the class InputConfigurator, the method setInputTableConfigs:

/**
 * Sets configurations for multiple tables at a time.
 *
 * @param implementingClass
 *          the class whose name will be used as a prefix for the property configuration key
 * @param conf
 *          the Hadoop configuration object to configure
 * @param configs
 *          a map of table names to {@link InputTableConfig} objects to associate with the job
 * @since 1.6.0
 */
public static void setInputTableConfigs(Class<?> implementingClass, Configuration conf, Map<String, InputTableConfig> configs) {
    MapWritable mapWritable = new MapWritable();
    for (Map.Entry<String, InputTableConfig> tableConfig : configs.entrySet()) mapWritable.put(new Text(tableConfig.getKey()), tableConfig.getValue());
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        mapWritable.write(new DataOutputStream(baos));
    } catch (IOException e) {
        throw new IllegalStateException("Table configuration could not be serialized.");
    }
    String confKey = enumToConfKey(implementingClass, ScanOpts.TABLE_CONFIGS);
    conf.set(confKey, Base64.getEncoder().encodeToString(baos.toByteArray()));
}
Also used: InputTableConfig (org.apache.accumulo.core.client.mapreduce.InputTableConfig), DataOutputStream (java.io.DataOutputStream), Text (org.apache.hadoop.io.Text), MapWritable (org.apache.hadoop.io.MapWritable), ByteArrayOutputStream (java.io.ByteArrayOutputStream), IOException (java.io.IOException), Map (java.util.Map), HashMap (java.util.HashMap)
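
For completeness, here is a minimal sketch of the inverse operation: reading the Base64 string back from the configuration and rebuilding the Map<String, InputTableConfig>. This is an illustrative reconstruction assuming the same class context (enumToConfKey, ScanOpts.TABLE_CONFIGS), not Accumulo's actual getter, which may differ in naming and error handling.

public static Map<String, InputTableConfig> getInputTableConfigs(Class<?> implementingClass, Configuration conf) {
    Map<String, InputTableConfig> configs = new HashMap<>();
    String serialized = conf.get(enumToConfKey(implementingClass, ScanOpts.TABLE_CONFIGS));
    if (serialized == null) {
        return configs;
    }
    MapWritable mapWritable = new MapWritable();
    try {
        // MapWritable serialized its non-standard classes along with the entries, so readFields
        // can reconstruct the InputTableConfig values as long as the class is on the classpath.
        byte[] bytes = Base64.getDecoder().decode(serialized);
        mapWritable.readFields(new DataInputStream(new ByteArrayInputStream(bytes)));
    } catch (IOException e) {
        throw new IllegalStateException("Table configuration could not be deserialized.", e);
    }
    for (Map.Entry<Writable, Writable> entry : mapWritable.entrySet()) {
        // Keys were stored as Text, values as InputTableConfig.
        configs.put(entry.getKey().toString(), (InputTableConfig) entry.getValue());
    }
    return configs;
}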

Example 3 with MapWritable

Use of org.apache.hadoop.io.MapWritable in project Solbase by Photobucket.

From the class SolbaseIndexReducer, the method reduce:

public void reduce(BytesWritable key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
    byte[] _key = null;
    int counter = 0;
    int dupCount = 0;
    // since the key is a checksum, we should deduplicate here
    // TODO: for now, only the first value is retrieved and the rest are ignored
    boolean first = true;
    for (MapWritable writable : values) {
        if (first) {
            first = false;
            Iterator<Writable> itr = writable.keySet().iterator();
            while (itr.hasNext()) {
                BytesWritable wrtKey = (BytesWritable) itr.next();
                Writable wrt = writable.get(wrtKey);
                if (wrt instanceof DocumentPutWritable) {
                    DocumentPutWritable docBytes = (DocumentPutWritable) wrt;
                    String globalId = docBytes.getGlobalId();
                    int docId = docBytes.getDocId();
                    Put mapping = new Put(Bytes.toBytes(globalId));
                    mapping.add(Bytes.toBytes("docId"), Bytes.toBytes(""), Bytes.toBytes(docId));
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docKeyIdMapTable), mapping);
                    context.getCounter(Counters.TOTAL_DOC_KEY_ID_MAP).increment(1);
                    List<String> fieldKeys = docBytes.getFieldKeys();
                    List<byte[]> fieldValues = docBytes.getFieldValues();
                    List<Term> allTerms = docBytes.getAllTerms();
                    byte[] md5DocId = SolbaseUtil.randomize(docId);
                    Put documentPut = new Put(md5DocId);
                    // Store each field as a column under this docId
                    for (int i = 0; i < fieldKeys.size(); i++) {
                        String fieldKey = fieldKeys.get(i);
                        byte[] fieldValue = fieldValues.get(i);
                        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(fieldKey), fieldValue);
                    }
                    // Finally, store metadata so we can delete this document
                    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allTerms).array());
                    context.write(new ImmutableBytesWritable(SolbaseUtil.docTable), documentPut);
                    context.getCounter(Counters.TOTAL_DOCS).increment(1);
                    counter++;
                } else if (wrt instanceof TermDocMetadataWritable) {
                    // gather all of docs given field key (field/value)
                    TermDocMetadataWritable metadata = (TermDocMetadataWritable) wrt;
                    // convert key to byte array
                    // byte[] fieldTermKey = key.getBytes();
                    byte[] termValue = metadata.getTermDocMetadata();
                    _key = metadata.getFieldTermKey();
                    int docId = metadata.getDocId();
                    Put put = null;
                    switch(TermDocMetadataLoader.storageType) {
                        case KEY_ONLY:
                            {
                                put = new Put(Bytes.add(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)), termValue));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), Bytes.toBytes(""));
                            }
                            break;
                        case WIDE_ROW:
                            int chunkId = TermDocMetadataLoader.getChunkId(docId);
                            put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(chunkId)));
                            put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(docId), termValue);
                            break;
                        case NARROW_ROW:
                        default:
                            {
                                put = new Put(Bytes.add(_key, SolbaseUtil.delimiter, Bytes.toBytes(docId)));
                                put.add(SolbaseUtil.termVectorDocColumnFamilyName, Bytes.toBytes(""), termValue);
                            }
                    }
                    context.write(new ImmutableBytesWritable(SolbaseUtil.termVectorTable), put);
                    context.getCounter(Counters.TOTAL_TERM_VECTORS).increment(1);
                    counter++;
                } else {
                    System.out.println("else: " + writable.getClass());
                    context.getCounter(Counters.TOTAL_INVALID).increment(1);
                }
            }
        } else {
            dupCount++;
        }
    }
    context.getCounter(Counters.DUPLICATE_ROWS).increment(dupCount);
}
Also used: ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), Writable (org.apache.hadoop.io.Writable), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable), MapWritable (org.apache.hadoop.io.MapWritable), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), BytesWritable (org.apache.hadoop.io.BytesWritable), Term (org.apache.lucene.index.Term), Put (org.apache.hadoop.hbase.client.Put)
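
One detail worth noting in the reducer above: the MapWritable entries are keyed by BytesWritable (either a field/term key or the literal bytes "doc" emitted by the mapper in the next example), and the keySet()/get() lookups work because BytesWritable defines equals() and hashCode() over its byte contents. A small standalone sketch of that behavior (illustrative only, not Solbase code):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

import java.nio.charset.StandardCharsets;

public class BytesWritableKeySketch {
    public static void main(String[] args) {
        MapWritable mw = new MapWritable();
        byte[] docKey = "doc".getBytes(StandardCharsets.UTF_8);
        mw.put(new BytesWritable(docKey), new Text("document payload"));

        // A separately constructed BytesWritable with equal bytes finds the same entry.
        Text payload = (Text) mw.get(new BytesWritable("doc".getBytes(StandardCharsets.UTF_8)));
        // prints: document payload
        System.out.println(payload);
    }
}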

Example 4 with MapWritable

Use of org.apache.hadoop.io.MapWritable in project Solbase by Photobucket.

From the class SolbaseInitialIndexMapper, the method map:

protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
    context.getCounter(Counters.TOTAL_ROWS).increment(1);
    context.setStatus(context.getCounter(Counters.TOTAL_ROWS) + "");
    // global id is user_media row key
    String globalId = Bytes.toString(row.get());
    Document doc = indexerUtil.createLuceneDocument(Bytes.toString(row.get()), values, context);
    byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));
    if (doc == null) {
        // validation must have failed if it returned null
        return;
    }
    // exists already
    if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
        docId = SolbaseUtil.generateUniqId();
        this.idCounter = 0;
    } else {
        docId--;
    }
    // for us, docId is a globally unique id, meaning we are tied to the 2 billion document limitation
    // it doesn't hurt to add this field to the doc, and it only really matters once sharding comes in and docs are fetched by their docId
    indexerUtil.addFieldToDoc(doc, "docId", docId + "");
    // incrementing chunking sequence (lucene doc id)
    this.idCounter++;
    try {
        ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());
        List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
        MapWritable mapWritable = new MapWritable();
        DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
        mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);
        for (TermDocMetadata metadata : metadatas) {
            byte[] key = metadata.getFieldTermKey();
            ByteBuffer buf = metadata.serialize();
            TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
            mapWritable.put(new BytesWritable(key), writable);
        }
        context.write(new BytesWritable(checksum), mapWritable);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used: ParsedDoc (org.solbase.indexer.ParsedDoc), TermDocMetadataWritable (org.solbase.indexer.writable.TermDocMetadataWritable), TermDocMetadata (org.solbase.lucenehbase.TermDocMetadata), BytesWritable (org.apache.hadoop.io.BytesWritable), ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable), MapWritable (org.apache.hadoop.io.MapWritable), Document (org.apache.lucene.document.Document), ByteBuffer (java.nio.ByteBuffer), DocumentPutWritable (org.solbase.indexer.writable.DocumentPutWritable)
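
The mapper emits a single MapWritable per document that mixes two custom value types (DocumentPutWritable and TermDocMetadataWritable) under BytesWritable keys. Hadoop can shuffle such a map because MapWritable records the concrete class of every non-standard entry when it serializes itself, and readFields() rebuilds the values on the reducer side. Below is a minimal round-trip sketch using standard Writables (illustrative; the custom Solbase writables would behave the same way provided they have no-argument constructors).

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class MapWritableRoundTripSketch {
    public static void main(String[] args) throws IOException {
        MapWritable out = new MapWritable();
        out.put(new BytesWritable("doc".getBytes(StandardCharsets.UTF_8)), new Text("document payload"));
        out.put(new BytesWritable("term".getBytes(StandardCharsets.UTF_8)), new IntWritable(7));

        // Serialize: MapWritable writes any non-standard classes it contains, then each key/value pair.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        out.write(new DataOutputStream(baos));

        // Deserialize into a fresh instance; entry types are reconstructed from the recorded class ids.
        MapWritable in = new MapWritable();
        in.readFields(new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));
        // prints: 7
        System.out.println(in.get(new BytesWritable("term".getBytes(StandardCharsets.UTF_8))));
    }
}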

Example 5 with MapWritable

Use of org.apache.hadoop.io.MapWritable in project hive by apache.

From the class TypedBytesWritableInput, the method readMap:

public MapWritable readMap(MapWritable mw) throws IOException {
    if (mw == null) {
        mw = new MapWritable();
    }
    int length = in.readMapHeader();
    for (int i = 0; i < length; i++) {
        Writable key = read();
        Writable value = read();
        mw.put(key, value);
    }
    return mw;
}
Also used: ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable), NullWritable (org.apache.hadoop.io.NullWritable), VLongWritable (org.apache.hadoop.io.VLongWritable), Writable (org.apache.hadoop.io.Writable), MapWritable (org.apache.hadoop.io.MapWritable), LongWritable (org.apache.hadoop.io.LongWritable), BytesWritable (org.apache.hadoop.io.BytesWritable), DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable), ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable), ArrayWritable (org.apache.hadoop.io.ArrayWritable), IntWritable (org.apache.hadoop.io.IntWritable), SortedMapWritable (org.apache.hadoop.io.SortedMapWritable), BooleanWritable (org.apache.hadoop.io.BooleanWritable), VIntWritable (org.apache.hadoop.io.VIntWritable), FloatWritable (org.apache.hadoop.io.FloatWritable)

Aggregations

MapWritable (org.apache.hadoop.io.MapWritable)18 Text (org.apache.hadoop.io.Text)10 Writable (org.apache.hadoop.io.Writable)9 BytesWritable (org.apache.hadoop.io.BytesWritable)5 IOException (java.io.IOException)4 MalformedURLException (java.net.MalformedURLException)3 URL (java.net.URL)3 ArrayWritable (org.apache.hadoop.io.ArrayWritable)3 BooleanWritable (org.apache.hadoop.io.BooleanWritable)3 LongWritable (org.apache.hadoop.io.LongWritable)3 NullWritable (org.apache.hadoop.io.NullWritable)3 Test (org.junit.Test)3 HashMap (java.util.HashMap)2 Map (java.util.Map)2 Properties (java.util.Properties)2 InputTableConfig (org.apache.accumulo.core.client.mapreduce.InputTableConfig)2 ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable)2 ByteWritable (org.apache.hadoop.io.ByteWritable)2 DoubleWritable (org.apache.hadoop.io.DoubleWritable)2 FloatWritable (org.apache.hadoop.io.FloatWritable)2