
Example 1 with TableRowkeyPair

use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project phoenix by apache.

From the class MultiHfileOutputFormat, method configureIncrementalLoad:

/**
 * Configures the job for MultiHfileOutputFormat.
 * @param job the MapReduce job being configured for bulk load
 * @param tablesToBeLoaded the target tables whose HFiles will be generated
 * @throws IOException if table metadata or region start keys cannot be read
 */
public static void configureIncrementalLoad(Job job, List<TargetTableRef> tablesToBeLoaded) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(MultiHfileOutputFormat.class);
    conf.setStrings("io.serializations", conf.get("io.serializations"), MutationSerialization.class.getName(), ResultSerialization.class.getName(), KeyValueSerialization.class.getName());
    // Region start keys across all tables.
    Set<TableRowkeyPair> tablesStartKeys = Sets.newTreeSet();
    for (TargetTableRef table : tablesToBeLoaded) {
        final String tableName = table.getPhysicalName();
        try (HTable htable = new HTable(conf, tableName)) {
            Set<TableRowkeyPair> startKeys = getRegionStartKeys(tableName, htable.getRegionLocator());
            tablesStartKeys.addAll(startKeys);
            String compressionConfig = configureCompression(htable.getTableDescriptor());
            String bloomTypeConfig = configureBloomType(htable.getTableDescriptor());
            String blockSizeConfig = configureBlockSize(htable.getTableDescriptor());
            String blockEncodingConfig = configureDataBlockEncoding(htable.getTableDescriptor());
            Map<String, String> tableConfigs = Maps.newHashMap();
            if (StringUtils.isNotBlank(compressionConfig)) {
                tableConfigs.put(COMPRESSION_FAMILIES_CONF_KEY, compressionConfig);
            }
            if (StringUtils.isNotBlank(bloomTypeConfig)) {
                tableConfigs.put(BLOOM_TYPE_FAMILIES_CONF_KEY, bloomTypeConfig);
            }
            if (StringUtils.isNotBlank(blockSizeConfig)) {
                tableConfigs.put(BLOCK_SIZE_FAMILIES_CONF_KEY, blockSizeConfig);
            }
            if (StringUtils.isNotBlank(blockEncodingConfig)) {
                tableConfigs.put(DATABLOCK_ENCODING_FAMILIES_CONF_KEY, blockEncodingConfig);
            }
            table.setConfiguration(tableConfigs);
            final String tableDefns = TargetTableRefFunctions.TO_JSON.apply(table);
            // Set the table definition in the config so it can be used later by the RecordWriter.
            conf.set(tableName, tableDefns);
            TargetTableRef tbl = TargetTableRefFunctions.FROM_JSON.apply(tableDefns);
            LOG.info(" the table logical name is " + tbl.getLogicalName());
        }
    }
    LOG.info("Configuring " + tablesStartKeys.size() + " reduce partitions to match current region count");
    job.setNumReduceTasks(tablesStartKeys.size());
    configurePartitioner(job, tablesStartKeys);
    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
}
Also used : TableRowkeyPair(org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) Configuration(org.apache.hadoop.conf.Configuration) TargetTableRef(org.apache.phoenix.mapreduce.bulkload.TargetTableRef) MutationSerialization(org.apache.hadoop.hbase.mapreduce.MutationSerialization) ResultSerialization(org.apache.hadoop.hbase.mapreduce.ResultSerialization) KeyValueSerialization(org.apache.hadoop.hbase.mapreduce.KeyValueSerialization) HTable(org.apache.hadoop.hbase.client.HTable)
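
For context, here is a minimal driver sketch of how this configuration step might be invoked; the job name and table names below are hypothetical, and only the configureIncrementalLoad call itself comes from the example above (assuming the single-argument TargetTableRef constructor).

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.phoenix.mapreduce.MultiHfileOutputFormat;
import org.apache.phoenix.mapreduce.bulkload.TargetTableRef;

public class BulkLoadDriverSketch {

    public static Job configureJob() throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Hypothetical job name.
        Job job = Job.getInstance(conf, "multi-hfile-bulk-load");
        // Hypothetical data table and index table to be loaded together.
        List<TargetTableRef> tables = new ArrayList<>();
        tables.add(new TargetTableRef("MY_TABLE"));
        tables.add(new TargetTableRef("MY_TABLE_IDX"));
        // Sets the output format, serializations, reducer count and partitioner,
        // one reduce partition per region start key across all tables.
        MultiHfileOutputFormat.configureIncrementalLoad(job, tables);
        return job;
    }
}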

Example 2 with TableRowkeyPair

use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project phoenix by apache.

From the class MultiHfileOutputFormat, method getRegionStartKeys:

/**
 * Returns the start keys of all regions in the given table,
 * as a sorted set of TableRowkeyPair (table name, region start key).
 */
private static Set<TableRowkeyPair> getRegionStartKeys(String tableName, RegionLocator table) throws IOException {
    byte[][] byteKeys = table.getStartKeys();
    Set<TableRowkeyPair> ret = new TreeSet<TableRowkeyPair>();
    for (byte[] byteKey : byteKeys) {
        // phoenix-2216: start : passing the table name and startkey
        ret.add(new TableRowkeyPair(tableName, new ImmutableBytesWritable(byteKey)));
    }
    return ret;
}
Also used : TableRowkeyPair(org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) TreeSet(java.util.TreeSet)
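
As a small illustration of why a TreeSet works here, the sketch below combines hypothetical region start keys from two tables into one ordered set, the same way configureIncrementalLoad accumulates tablesStartKeys in Example 1. The ordering by table name and then rowkey bytes is an assumption based on TableRowkeyPair's natural ordering, which this page does not show.

import java.util.Set;
import java.util.TreeSet;

import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair;

public class StartKeySketch {

    public static Set<TableRowkeyPair> combinedStartKeys() {
        // TreeSet relies on TableRowkeyPair's natural ordering, just like the
        // tablesStartKeys set in configureIncrementalLoad.
        Set<TableRowkeyPair> keys = new TreeSet<>();
        // Hypothetical region boundaries: each table's first region starts at the empty key.
        keys.add(new TableRowkeyPair("TABLE_A", new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY)));
        keys.add(new TableRowkeyPair("TABLE_A", new ImmutableBytesWritable(Bytes.toBytes("m"))));
        keys.add(new TableRowkeyPair("TABLE_B", new ImmutableBytesWritable(HConstants.EMPTY_BYTE_ARRAY)));
        return keys;
    }
}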

Example 3 with TableRowkeyPair

use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project phoenix by apache.

From the class FormatToBytesWritableMapper, method writeAggregatedRow:

/**
 * Collects all column values for the same row. The RowKey may differ if indexes are involved,
 * so a separate record is written for each unique RowKey.
 *
 * @param context   Current mapper context
 * @param tableName Name of the table the cells belong to
 * @param lkv       List of KeyValues that will be combined into a single ImmutableBytesWritable
 * @throws IOException
 * @throws InterruptedException
 */
private void writeAggregatedRow(Context context, String tableName, List<KeyValue> lkv) throws IOException, InterruptedException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    DataOutputStream outputStream = new DataOutputStream(bos);
    ImmutableBytesWritable outputKey = null;
    if (!lkv.isEmpty()) {
        for (KeyValue cell : lkv) {
            if (outputKey == null || Bytes.compareTo(outputKey.get(), outputKey.getOffset(), outputKey.getLength(), cell.getRowArray(), cell.getRowOffset(), cell.getRowLength()) != 0) {
                // This is the first RowKey, or a different one from the previous
                if (outputKey != null) {
                    // It's a different RowKey, so we need to write it
                    ImmutableBytesWritable aggregatedArray = new ImmutableBytesWritable(bos.toByteArray());
                    outputStream.close();
                    context.write(new TableRowkeyPair(tableName, outputKey), aggregatedArray);
                }
                outputKey = new ImmutableBytesWritable(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength());
                bos = new ByteArrayOutputStream(1024);
                outputStream = new DataOutputStream(bos);
            }
            /*
             * The order of aggregation: type, index of column, length of value, value itself
             */
            int i = findIndex(cell);
            if (i == -1) {
                // Skip KVs that do not belong to the local index
                continue;
            }
            outputStream.writeByte(cell.getTypeByte());
            WritableUtils.writeVLong(outputStream, cell.getTimestamp());
            WritableUtils.writeVInt(outputStream, i);
            WritableUtils.writeVInt(outputStream, cell.getValueLength());
            outputStream.write(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
        }
        ImmutableBytesWritable aggregatedArray = new ImmutableBytesWritable(bos.toByteArray());
        outputStream.close();
        context.write(new TableRowkeyPair(tableName, outputKey), aggregatedArray);
    }
}
Also used : ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) KeyValue(org.apache.hadoop.hbase.KeyValue) TableRowkeyPair(org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) DataOutputStream(java.io.DataOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream)
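
Based purely on the write order above (type byte, vlong timestamp, vint column index, vint value length, then the value bytes), the following is a hedged sketch of how a reducer could decode one aggregated value. The real Phoenix reducer is not shown on this page, so the class and method names here are illustrative.

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.io.WritableUtils;

public class AggregatedRowDecoderSketch {

    /** Decodes one aggregated row value, mirroring the layout written by writeAggregatedRow. */
    public static void decode(byte[] aggregated) throws IOException {
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(aggregated));
        while (in.available() > 0) {
            byte type = in.readByte();                    // KeyValue type byte
            long timestamp = WritableUtils.readVLong(in); // cell timestamp
            int columnIndex = WritableUtils.readVInt(in); // index into the column list
            int valueLength = WritableUtils.readVInt(in); // length of the value
            byte[] value = new byte[valueLength];
            in.readFully(value);                          // the value bytes themselves
            System.out.println("type=" + type + " ts=" + timestamp
                    + " column=" + columnIndex + " valueLength=" + valueLength);
        }
    }
}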

Example 4 with TableRowkeyPair

use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project phoenix by apache.

From the class MultiHfileOutputFormat, method writePartitions:

private static void writePartitions(Configuration conf, Path partitionsPath, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (tablesStartKeys.isEmpty()) {
        throw new IllegalArgumentException("No regions passed");
    }
    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<TableRowkeyPair> sorted = new TreeSet<TableRowkeyPair>(tablesStartKeys);
    TableRowkeyPair first = sorted.first();
    if (!first.getRowkey().equals(HConstants.EMPTY_BYTE_ARRAY)) {
        throw new IllegalArgumentException("First region of table should have empty start key. Instead has: " + Bytes.toStringBinary(first.getRowkey().get()));
    }
    sorted.remove(first);
    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath, TableRowkeyPair.class, NullWritable.class);
    try {
        for (TableRowkeyPair startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}
Also used : TableRowkeyPair(org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) SequenceFile(org.apache.hadoop.io.SequenceFile) TreeSet(java.util.TreeSet) FileSystem(org.apache.hadoop.fs.FileSystem)
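
The partitions file written here (sorted keys, NullWritable values) is the layout Hadoop's TotalOrderPartitioner expects. Below is a hedged sketch of what the configurePartitioner step referenced in Example 1 might look like; the temp-path handling is an assumption, since the actual Phoenix implementation is not shown on this page.

import java.io.IOException;
import java.util.Set;
import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair;

public class PartitionerSketch {

    static void configurePartitioner(Job job, Set<TableRowkeyPair> tablesStartKeys) throws IOException {
        Configuration conf = job.getConfiguration();
        // Hypothetical temporary location for the partitions file.
        Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
        partitionsPath.getFileSystem(conf).deleteOnExit(partitionsPath);
        // The file itself would be produced exactly as in the writePartitions example above:
        // one sorted TableRowkeyPair per split point, with NullWritable values.
        TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
}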

Example 5 with TableRowkeyPair

use of org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair in project phoenix by apache.

From the class MultiHfileOutputFormat, method createRecordWriter:

/**
 * @param context the task attempt context for the current task
 * @return a RecordWriter that writes HFiles into per-table, per-family directories
 * @throws IOException if the output directory or writers cannot be created
 */
static <V extends Cell> RecordWriter<TableRowkeyPair, V> createRecordWriter(final TaskAttemptContext context) throws IOException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config.  Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr);
    final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude", false);
    return new RecordWriter<TableRowkeyPair, V>() {

        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(Bytes.BYTES_COMPARATOR);

        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;

        private final byte[] now = Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis());

        private boolean rollRequested = false;

        @Override
        public void write(TableRowkeyPair row, V cell) throws IOException {
            KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
            // null input == user explicitly wants to flush
            if (row == null && kv == null) {
                rollWriters();
                return;
            }
            // phoenix-2216: start : extract table name from the rowkey
            String tableName = row.getTableName();
            byte[] rowKey = row.getRowkey().get();
            long length = kv.getLength();
            byte[] family = CellUtil.cloneFamily(kv);
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            WriterLength wl = this.writers.get(tableAndFamily);
            // If this is a new table/family combination, create its output directory
            if (wl == null) {
                // phoenix-2216: start : create a directory for table and family within the output dir
                Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
                fs.mkdirs(new Path(tableOutputPath, Bytes.toString(family)));
            // phoenix-2216: end
            }
            // If any of the HFiles for the column families has reached maxsize, we need to roll all the writers
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }
            // This can only happen once a row is finished though
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }
            // create a new HFile writer, if necessary
            if (wl == null || wl.writer == null) {
                // phoenix-2216: start : passed even the table name
                wl = getNewWriter(tableName, family, conf);
            // phoenix-2216: end
            }
            // we now have the proper HFile writer. full steam ahead
            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;
            // Remember the row so we can detect row transitions.
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath() + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        /*
         * Create a new StoreFile.Writer.
         * @param tableName table whose HFile is being written
         * @param family    column family the writer is for
         * @return A WriterLength, containing a new StoreFile.Writer.
         * @throws IOException
         */
        @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED", justification = "Not important")
        private WriterLength getNewWriter(final String tableName, byte[] family, Configuration conf) throws IOException {
            WriterLength wl = new WriterLength();
            Path tableOutputPath = CsvBulkImportUtil.getOutputPath(outputdir, tableName);
            Path familydir = new Path(tableOutputPath, Bytes.toString(family));
            // phoenix-2216: start : fetch the configuration properties that were set for the table.
            // create a map from column family to the compression algorithm for the table.
            final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf, tableName);
            final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf, tableName);
            final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf, tableName);
            // phoenix-2216: end
            String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
            final Map<byte[], DataBlockEncoding> datablockEncodingMap = createFamilyDataBlockEncodingMap(conf, tableName);
            final DataBlockEncoding overriddenEncoding;
            if (dataBlockEncodingStr != null) {
                overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
            } else {
                overriddenEncoding = null;
            }
            Algorithm compression = compressionMap.get(family);
            compression = compression == null ? defaultCompression : compression;
            BloomType bloomType = bloomTypeMap.get(family);
            bloomType = bloomType == null ? BloomType.NONE : bloomType;
            Integer blockSize = blockSizeMap.get(family);
            blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
            DataBlockEncoding encoding = overriddenEncoding;
            encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
            encoding = encoding == null ? DataBlockEncoding.NONE : encoding;
            Configuration tempConf = new Configuration(conf);
            tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
            HFileContextBuilder contextBuilder = new HFileContextBuilder().withCompression(compression).withChecksumType(HStore.getChecksumType(conf)).withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)).withBlockSize(blockSize);
            contextBuilder.withDataBlockEncoding(encoding);
            HFileContext hFileContext = contextBuilder.build();
            wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs).withOutputDir(familydir).withBloomType(bloomType).withComparator(KeyValue.COMPARATOR).withFileContext(hFileContext).build();
            // phoenix-2216: start : the writers map is keyed by a byte array
            // that joins the table name and the column family.
            byte[] tableAndFamily = join(tableName, Bytes.toString(family));
            this.writers.put(tableAndFamily, wl);
            // phoenix-2216: end
            return wl;
        }

        private void close(final StoreFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(EnvironmentEdgeManager.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY, Bytes.toBytes(compactionExclude));
                w.appendTrackedTimestampsToMetadata();
                w.close();
            }
        }

        @Override
        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}
Also used : DataBlockEncoding(org.apache.hadoop.hbase.io.encoding.DataBlockEncoding) KeyValue(org.apache.hadoop.hbase.KeyValue) Configuration(org.apache.hadoop.conf.Configuration) HFileContextBuilder(org.apache.hadoop.hbase.io.hfile.HFileContextBuilder) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) TableRowkeyPair(org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) FileSystem(org.apache.hadoop.fs.FileSystem) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig) Path(org.apache.hadoop.fs.Path) FileOutputCommitter(org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Algorithm(org.apache.hadoop.hbase.io.compress.Compression.Algorithm) HFileContext(org.apache.hadoop.hbase.io.hfile.HFileContext) BloomType(org.apache.hadoop.hbase.regionserver.BloomType) Map(java.util.Map) TreeMap(java.util.TreeMap) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) AbstractHFileWriter(org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter)
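
Because the writer above places HFiles under a per-table subdirectory of the job output path (output/tableName/family/), each table can be bulk loaded separately once the job finishes. Here is a hedged sketch of that final step using HBase's LoadIncrementalHFiles; the output path and table name are hypothetical, and this loading step is not part of the code shown on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class CompleteBulkLoadSketch {

    public static void load() throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Hypothetical job output directory and table name.
        Path tableOutputPath = new Path("/tmp/bulkload-output", "MY_TABLE");
        try (HTable table = new HTable(conf, "MY_TABLE")) {
            // Moves the generated HFiles for each family of this table into its live regions.
            new LoadIncrementalHFiles(conf).doBulkLoad(tableOutputPath, table);
        }
    }
}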

Aggregations

TableRowkeyPair (org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair) 5
TreeSet (java.util.TreeSet) 2
Configuration (org.apache.hadoop.conf.Configuration) 2
FileSystem (org.apache.hadoop.fs.FileSystem) 2
KeyValue (org.apache.hadoop.hbase.KeyValue) 2
ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable) 2
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 1
DataOutputStream (java.io.DataOutputStream) 1
Map (java.util.Map) 1
TreeMap (java.util.TreeMap) 1
Path (org.apache.hadoop.fs.Path) 1
HTable (org.apache.hadoop.hbase.client.HTable) 1
Algorithm (org.apache.hadoop.hbase.io.compress.Compression.Algorithm) 1
DataBlockEncoding (org.apache.hadoop.hbase.io.encoding.DataBlockEncoding) 1
AbstractHFileWriter (org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter) 1
CacheConfig (org.apache.hadoop.hbase.io.hfile.CacheConfig) 1
HFileContext (org.apache.hadoop.hbase.io.hfile.HFileContext) 1
HFileContextBuilder (org.apache.hadoop.hbase.io.hfile.HFileContextBuilder) 1
KeyValueSerialization (org.apache.hadoop.hbase.mapreduce.KeyValueSerialization) 1
MutationSerialization (org.apache.hadoop.hbase.mapreduce.MutationSerialization) 1
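
All five examples treat TableRowkeyPair as a comparable (table name, rowkey) composite: it is constructed from a table name and an ImmutableBytesWritable, read back through getTableName() and getRowkey(), and kept in sorted sets. The sketch below captures only that contract as inferred from the calls above; the real org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair also implements Hadoop Writable serialization, which is omitted here.

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;

/**
 * Simplified sketch of the TableRowkeyPair contract as used in the examples above:
 * a (table name, rowkey) composite ordered by table name first, then rowkey bytes.
 */
public class TableRowkeyPairSketch implements Comparable<TableRowkeyPairSketch> {

    private final String tableName;
    private final ImmutableBytesWritable rowkey;

    public TableRowkeyPairSketch(String tableName, ImmutableBytesWritable rowkey) {
        this.tableName = tableName;
        this.rowkey = rowkey;
    }

    public String getTableName() { return tableName; }

    public ImmutableBytesWritable getRowkey() { return rowkey; }

    @Override
    public int compareTo(TableRowkeyPairSketch other) {
        // Order by table name first, then by the rowkey bytes.
        int result = tableName.compareTo(other.tableName);
        return result != 0 ? result : rowkey.compareTo(other.rowkey);
    }
}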