Use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by Apache.
The class CarbonInputFormat, method getSplits.
/**
* {@inheritDoc}
* The configuration property FileInputFormat.INPUT_DIR
* is used to get the table path to read.
*
* @param job
* @return List<InputSplit> list of CarbonInputSplit
* @throws IOException
*/
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  AbsoluteTableIdentifier identifier = getAbsoluteTableIdentifier(job.getConfiguration());
  CacheClient cacheClient = new CacheClient(identifier.getStorePath());
  try {
    List<String> invalidSegments = new ArrayList<>();
    List<UpdateVO> invalidTimestampsList = new ArrayList<>();
    // get all valid segments and set them into the configuration
    if (getSegmentsToAccess(job).length == 0) {
      SegmentStatusManager segmentStatusManager = new SegmentStatusManager(identifier);
      SegmentStatusManager.ValidAndInvalidSegmentsInfo segments =
          segmentStatusManager.getValidAndInvalidSegments();
      SegmentUpdateStatusManager updateStatusManager = new SegmentUpdateStatusManager(identifier);
      setSegmentsToAccess(job.getConfiguration(), segments.getValidSegments());
      if (segments.getValidSegments().size() == 0) {
        return new ArrayList<>(0);
      }
      // remove entries from the segment index cache if there are invalid segments
      invalidSegments.addAll(segments.getInvalidSegments());
      for (String invalidSegmentId : invalidSegments) {
        invalidTimestampsList.add(updateStatusManager.getInvalidTimestampRange(invalidSegmentId));
      }
      if (invalidSegments.size() > 0) {
        List<TableSegmentUniqueIdentifier> invalidSegmentsIds = new ArrayList<>(invalidSegments.size());
        for (String segId : invalidSegments) {
          invalidSegmentsIds.add(new TableSegmentUniqueIdentifier(identifier, segId));
        }
        cacheClient.getSegmentAccessClient().invalidateAll(invalidSegmentsIds);
      }
    }
    // process and resolve the filter expression
    Expression filter = getFilterPredicates(job.getConfiguration());
    CarbonTable carbonTable = getCarbonTable(job.getConfiguration());
    // this will be null in case of a corrupt schema file
    if (null == carbonTable) {
      throw new IOException("Missing/Corrupt schema file for table.");
    }
    CarbonInputFormatUtil.processFilterExpression(filter, carbonTable);
    // prune partitions for a filter query on a partitioned table
    BitSet matchedPartitions = null;
    if (null != filter) {
      PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getFactTableName());
      if (null != partitionInfo) {
        Partitioner partitioner = PartitionUtil.getPartitioner(partitionInfo);
        matchedPartitions =
            new FilterExpressionProcessor().getFilteredPartitions(filter, partitionInfo, partitioner);
        if (matchedPartitions.cardinality() == 0) {
          // no partition is required
          return new ArrayList<InputSplit>();
        }
        if (matchedPartitions.cardinality() == partitioner.numPartitions()) {
          // all partitions are required, so there is no need to prune
          matchedPartitions = null;
        }
      }
    }
    FilterResolverIntf filterInterface = CarbonInputFormatUtil.resolveFilter(filter, identifier);
    // do block filtering and get the splits
    List<InputSplit> splits = getSplits(job, filterInterface, matchedPartitions, cacheClient);
    // pass the invalid segments to the task side so their index entries can be removed there
    if (invalidSegments.size() > 0) {
      for (InputSplit split : splits) {
        ((CarbonInputSplit) split).setInvalidSegments(invalidSegments);
        ((CarbonInputSplit) split).setInvalidTimestampRange(invalidTimestampsList);
      }
    }
    return splits;
  } finally {
    // close the cache client to clear LRU cache memory
    cacheClient.close();
  }
}
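A minimal driver-side sketch (not from the CarbonData source) of how this getSplits implementation could be exercised directly. Split computation is normally driven by the MapReduce framework, so invoking it by hand, the table path used below, and whether this minimal configuration is enough for a real CarbonData store are all assumptions; Configuration, Job, Path, and FileInputFormat are the standard Hadoop classes.

// Sketch only: assumes a CarbonData table already exists under the hypothetical path below.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "carbon-split-listing");
// getSplits() reads FileInputFormat.INPUT_DIR to locate the table to read
FileInputFormat.addInputPath(job, new Path("/opt/carbonStore/default/my_table"));
CarbonInputFormat<Object> carbonInputFormat = new CarbonInputFormat<>();
// Job implements JobContext, so it can be passed straight to getSplits()
List<InputSplit> splits = carbonInputFormat.getSplits(job);
System.out.println("CarbonInputSplits returned: " + splits.size());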
Use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by Apache.
The class CarbonInputFormat, method getSplitsInternal.
private List<InputSplit> getSplitsInternal(JobContext job) throws IOException {
  List<InputSplit> splits = super.getSplits(job);
  List<InputSplit> carbonSplits = new ArrayList<InputSplit>(splits.size());
  // identify table blocks
  for (InputSplit inputSplit : splits) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    String segmentId = CarbonTablePath.DataPathUtil.getSegmentId(fileSplit.getPath().toString());
    if (segmentId.equals(CarbonCommonConstants.INVALID_SEGMENT_ID)) {
      continue;
    }
    carbonSplits.add(CarbonInputSplit.from(segmentId, fileSplit,
        ColumnarFormatVersion.valueOf(CarbonCommonConstants.CARBON_DATA_FILE_DEFAULT_VERSION)));
  }
  return carbonSplits;
}
Use of org.apache.hadoop.mapreduce.InputSplit in project carbondata by Apache.
The class IndexedSegment, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext job, FilterResolverIntf filterResolver) throws IOException {
  // do the following:
  // 1. create the index, or get it from the cache, by the filter name in the configuration
  // 2. filter by the index to get the filtered blocks
  // 3. create an input split from each filtered block
  List<InputSplit> output = new LinkedList<>();
  Index index = loader.load(this);
  List<Block> blocks = index.filter(job, filterResolver);
  for (Block block : blocks) {
    output.add(makeInputSplit(block));
  }
  return output;
}
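The three-step pattern in the comments above (load or look up an index, filter blocks with it, wrap each surviving block in a split) is generic. The sketch below illustrates only the filtering step with hypothetical SimpleBlock and SimpleMinMaxIndex types and a plain numeric predicate; it is not the actual CarbonData Index, Block, or IndexLoader API.

// Hypothetical illustration of min/max block pruning; not the CarbonData interfaces.
final class SimpleBlock {
  final String path;
  final long min;
  final long max;
  SimpleBlock(String path, long min, long max) {
    this.path = path;
    this.min = min;
    this.max = max;
  }
}

final class SimpleMinMaxIndex {
  private final List<SimpleBlock> blocks;
  SimpleMinMaxIndex(List<SimpleBlock> blocks) {
    this.blocks = blocks;
  }
  // keep only blocks whose [min, max] range can contain the requested value
  List<SimpleBlock> filter(long value) {
    List<SimpleBlock> matched = new ArrayList<>();
    for (SimpleBlock block : blocks) {
      if (value >= block.min && value <= block.max) {
        matched.add(block);
      }
    }
    return matched;
  }
}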
Use of org.apache.hadoop.mapreduce.InputSplit in project gora by Apache.
The class FileBackedDataStoreBase, method getPartitions.
@Override
public List<PartitionQuery<K, T>> getPartitions(Query<K, T> query) {
  List<InputSplit> splits = null;
  List<PartitionQuery<K, T>> queries = null;
  try {
    splits = GoraMapReduceUtils.getSplits(getConf(), inputPath);
    queries = new ArrayList<>(splits.size());
    for (InputSplit split : splits) {
      queries.add(new FileSplitPartitionQuery<>(query, (FileSplit) split));
    }
  } catch (IOException ex) {
    LOG.error(ex.getMessage(), ex);
  }
  return queries;
}
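A hedged usage sketch (not from the Gora source) of how the returned partitions are typically consumed: each PartitionQuery is executed against the same store like an ordinary query. The dataStore variable, the Employee bean, and the String key type are assumptions for illustration.

// Sketch only: dataStore is an already-initialized Gora DataStore<String, Employee>;
// the enclosing method is assumed to declare the checked exceptions thrown by Result.
Query<String, Employee> query = dataStore.newQuery();
List<PartitionQuery<String, Employee>> partitions = dataStore.getPartitions(query);
for (PartitionQuery<String, Employee> partition : partitions) {
  Result<String, Employee> result = dataStore.execute(partition);
  while (result.next()) {
    // one key/value pair per iteration
    String key = result.getKey();
    Employee employee = result.get();
    // ... process the record ...
  }
  result.close();
}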
Use of org.apache.hadoop.mapreduce.InputSplit in project gora by Apache.
The class TestGoraInputFormat, method testGetSplits.
/**
* First, asserts that the attempt to obtain splits results in
* more than 0 splits, which can then be used for computation.
* We then check that the partition query (obtained from those
* splits) has the same fields as we would expect by directly
* accessing the fields of an Employee object.
* @throws IOException
* @throws InterruptedException
*/
@Test
@SuppressWarnings("rawtypes")
public void testGetSplits() throws IOException, InterruptedException {
  List<InputSplit> splits = getInputSplits();
  assertTrue(splits.size() > 0);
  InputSplit split = splits.get(0);
  PartitionQuery query = ((GoraInputSplit) split).getQuery();
  assertTrue(Arrays.equals(getEmployeeFieldNames(), query.getFields()));
}